From 0d599e8b825e0a7fe08f9ea130b85500b5123159 Mon Sep 17 00:00:00 2001
From: Frederic Perez
Date: Tue, 2 Apr 2024 14:46:06 +0200
Subject: [PATCH 01/54] try to improve particle exchange (CPU for now)

---
 doc/Sphinx/implementation.rst            |   2 +-
 src/ParticleBC/BoundaryConditionType.cpp |  30 +-
 src/Particles/Particles.cpp              | 118 ++++-
 src/Particles/Particles.h                |   4 +
 src/Patch/Patch.cpp                      | 534 ++++++++++-------------
 src/Patch/Patch.h                        |   4 +-
 src/Patch/SyncVectorPatch.cpp            |  60 ++-
 src/Patch/SyncVectorPatch.h              |   7 +-
 src/Patch/VectorPatch.cpp                |  22 +-
 src/Patch/VectorPatch.h                  |   4 +-
 src/Smilei.cpp                           |   2 +-
 src/SmileiMPI/AsyncMPIbuffers.cpp        |  22 +-
 src/SmileiMPI/AsyncMPIbuffers.h          |  12 +-
 src/Species/Species.cpp                  |  37 +-
 src/Species/SpeciesV.cpp                 |  48 +-
 src/Species/SpeciesVAdaptive.cpp         |  10 +-
 src/Tools/Timers.cpp                     |   2 +-
 17 files changed, 478 insertions(+), 440 deletions(-)

diff --git a/doc/Sphinx/implementation.rst b/doc/Sphinx/implementation.rst
index 46bf953e9..0d35165b2 100644
--- a/doc/Sphinx/implementation.rst
+++ b/doc/Sphinx/implementation.rst
@@ -547,7 +547,7 @@ file ``Smilei.cpp`` through calls to different ``vecPatches`` methods.

 .. code-block:: c++

-    vecPatches.finalizeAndSortParticles( params, &smpi, simWindow,
+    vecPatches.finalizeExchParticlesAndSort( params, &smpi, simWindow,
                                          time_dual, timers, itime );

 * **Particle merging**: merging process for particles (still experimental)
diff --git a/src/ParticleBC/BoundaryConditionType.cpp b/src/ParticleBC/BoundaryConditionType.cpp
index 318b6b289..5a55d74b2 100755
--- a/src/ParticleBC/BoundaryConditionType.cpp
+++ b/src/ParticleBC/BoundaryConditionType.cpp
@@ -28,9 +28,9 @@ void internal_inf( Species *species, int imin, int imax, int direction, double l
                                cell_keys /* [imin:imax - imin] */ )
     #pragma omp teams distribute parallel for
 #endif
-    for (int ipart=imin ; ipart<imax ; ipart++ ) {
-        if ( position[ ipart ] < limit_inf ) {
-            cell_keys[ ipart ] = -1;
+    for( int ipart=imin ; ipart<imax ; ipart++ ) {
+        if( cell_keys[ ipart ] >= 0 && position[ ipart ] < limit_inf ) {
+            cell_keys[ ipart ] = -2 - 2 * direction;
         }
     }
 }
@@ -50,9 +50,9 @@ void internal_sup( Species *species, int imin, int imax, int direction, double l
                                cell_keys /* [imin:imax - imin] */ )
     #pragma omp teams distribute parallel for
 #endif
-    for (int ipart=imin ; ipart<imax ; ipart++ ) {
-        if ( position[ ipart ] >= limit_sup ) {
-            cell_keys[ ipart ] = -1;
+    for( int ipart=imin ; ipart<imax ; ipart++ ) {
+        if( cell_keys[ ipart ] >= 0 && position[ ipart ] >= limit_sup ) {
+            cell_keys[ ipart ] = -3 - 2 * direction;
         }
     }
 }
@@ -63,10 +63,11 @@ void internal_inf_AM( Species *species, int imin, int imax, int /*direction*/, d
     double* position_y = species->particles->getPtrPosition(1);
     double* position_z = species->particles->getPtrPosition(2);
     int* cell_keys = species->particles->getPtrCellKeys();
-    for (int ipart=imin ; ipart<imax ; ipart++ ) {
-        double distance2ToAxis = position_y[ipart]*position_y[ipart] + position_z[ipart]*position_z[ipart];
-        if ( distance2ToAxis < limit_inf*limit_inf ) {
-            cell_keys[ ipart ] = -1;
+    double limit_inf2 = limit_inf*limit_inf;
+    for( int ipart=imin ; ipart<imax ; ipart++ ) {
+        double distance2ToAxis = position_y[ipart]*position_y[ipart] + position_z[ipart]*position_z[ipart];
+        if( cell_keys[ ipart ] >= 0 && distance2ToAxis < limit_inf2 ) {
+            cell_keys[ ipart ] = -4;
         }
     }
 }
@@ -77,10 +78,11 @@ void internal_sup_AM( Species *species, int imin, int imax, int /*direction*/, d
     double* position_y = species->particles->getPtrPosition(1);
     double* position_z = species->particles->getPtrPosition(2);
     int* cell_keys = species->particles->getPtrCellKeys();
-    for (int ipart=imin ; ipart<imax ; ipart++ ) {
-        double distance2ToAxis = position_y[ipart]*position_y[ipart] + position_z[ipart]*position_z[ipart];
-        if ( distance2ToAxis >= limit_sup*limit_sup ) {
-            cell_keys[ ipart ] = -1;
+    double limit_sup2 = limit_sup*limit_sup;
+    for( int ipart=imin ; ipart<imax ; ipart++ ) {
+        double distance2ToAxis = position_y[ipart]*position_y[ipart] + position_z[ipart]*position_z[ipart];
+        if( cell_keys[ ipart ] >= 0 && distance2ToAxis >= limit_sup2 ) {
+            cell_keys[ ipart ] = -5;
         }
     }
 }
@@ -97,8 +99,8 @@ void reflect_particle_inf( Species *species, int imin, int imax, int direction,
 #pragma omp target is_device_ptr( position, momentum )
 #pragma omp teams distribute parallel for
 #endif
-    for (int ipart=imin ; ipart<imax ; ipart++ ) {
diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp
--- a/src/Particles/Particles.cpp
+++ b/src/Particles/Particles.cpp
+// ---------------------------------------------------------------------------------------------------------------------
+//! Copy particles indexed by array 'indices' to dest_id in dest_parts
+//! The array 'indices' must be sorted in increasing order
+//! cell keys not affected
+// ---------------------------------------------------------------------------------------------------------------------
+void Particles::copyParticles( vector<size_t> indices, Particles &dest_parts, int dest_id )
+{
+    const size_t transfer_size = indices.size();
+    const size_t dest_new_size = dest_parts.size() + transfer_size;
+
+    for( unsigned int iprop=0 ; iprop<double_prop_.size() ; iprop++ ) {
+        // Resize and make room at dest_id
+        dest_parts.double_prop_[iprop]->resize( dest_new_size );
+        auto loc = dest_parts.double_prop_[iprop]->begin() + dest_id;
+        move_backward( loc, loc + transfer_size, dest_parts.double_prop_[iprop]->end() );
+        // Copy data
+        for( size_t i = 0; i < transfer_size; i++ ) {
+            ( *dest_parts.double_prop_[iprop] )[dest_id+i] = ( *double_prop_[iprop] )[indices[i]];
+        }
+    }
+
+    for( unsigned int iprop=0 ; iprop<short_prop_.size() ; iprop++ ) {
+        // Resize and make room at dest_id
+        dest_parts.short_prop_[iprop]->resize( dest_new_size );
+        auto loc = dest_parts.short_prop_[iprop]->begin() + dest_id;
+        move_backward( loc, loc + transfer_size, dest_parts.short_prop_[iprop]->end() );
+        // Copy data
+        for( size_t i = 0; i < transfer_size; i++ ) {
+            ( *dest_parts.short_prop_[iprop] )[dest_id+i] = ( *short_prop_[iprop] )[indices[i]];
+        }
+    }
+
+    for( unsigned int iprop=0 ; iprop<uint64_prop_.size() ; iprop++ ) {
+        // Resize and make room at dest_id
+        dest_parts.uint64_prop_[iprop]->resize( dest_new_size );
+        auto loc = dest_parts.uint64_prop_[iprop]->begin() + dest_id;
+        move_backward( loc, loc + transfer_size, dest_parts.uint64_prop_[iprop]->end() );
+        // Copy data
+        for( size_t i = 0; i < transfer_size; i++ ) {
+            ( *dest_parts.uint64_prop_[iprop] )[dest_id+i] = ( *uint64_prop_[iprop] )[indices[i]];
+        }
+    }
+}
+
 // ---------------------------------------------------------------------------------------------------------------------
 //! Make a new particle at the position of another
 //! cell keys not affected
 // ---------------------------------------------------------------------------------------------------------------------
@@ -529,6 +573,70 @@ void Particles::eraseParticle( unsigned int ipart, unsigned int npart, bool comp
 }

+
+// ---------------------------------------------------------------------------------------------------------------------
+//! Erase particles indexed by array 'indices'
+//! The array 'indices' must be sorted in increasing order
+//! cell keys not affected
+// ---------------------------------------------------------------------------------------------------------------------
+void Particles::eraseParticles( vector<size_t> indices )
+{
+    const size_t indices_size = indices.size();
+    const size_t initial_size = size();
+
+    if( indices_size > 0 ) {
+
+        for( auto prop : double_prop_ ) {
+            // Relocate data to fill erased space
+            size_t j = 1, stop = ( 1 == indices_size ) ? initial_size : indices[1], to = indices[0];
+            for( size_t from = indices[0]+1; from < initial_size; from++ ) {
+                if( from < stop ) {
+                    ( *prop )[to] = ( *prop )[from];
+                    to++;
+                } else {
+                    j++;
+                    stop = ( j == indices_size ) ? initial_size : indices[j];
+                }
+            }
+            // Resize
+            prop->resize( initial_size - indices_size );
+        }
+
+        for( auto prop : short_prop_ ) {
+            // Relocate data to fill erased space
+            size_t j = 1, stop = ( 1 == indices_size ) ? initial_size : indices[1], to = indices[0];
+            for( size_t from = indices[0]+1; from < initial_size; from++ ) {
+                if( from < stop ) {
+                    ( *prop )[to] = ( *prop )[from];
+                    to++;
+                } else {
+                    j++;
+                    stop = ( j == indices_size ) ? initial_size : indices[j];
+                }
+            }
+            // Resize
+            prop->resize( initial_size - indices_size );
+        }
+
+        for( auto prop : uint64_prop_ ) {
+            // Relocate data to fill erased space
+            size_t j = 1, stop = ( 1 == indices_size ) ? initial_size : indices[1], to = indices[0];
+            for( size_t from = indices[0]+1; from < initial_size; from++ ) {
+                if( from < stop ) {
+                    ( *prop )[to] = ( *prop )[from];
+                    to++;
+                } else {
+                    j++;
+                    stop = ( j == indices_size ) ? initial_size : indices[j];
+                }
+            }
+            // Resize
+            prop->resize( initial_size - indices_size );
+        }
+
+    }
+}
+
 // ---------------------------------------------------------------------------------------------------------------------
 // Print parameters of particle iPart
 // ---------------------------------------------------------------------------------------------------------------------
@@ -1198,11 +1306,11 @@ void Particles::copyFromDeviceToHost()
 void Particles::extractParticles( Particles* particles_to_move )
 {
     particles_to_move->clear();
-    for ( int ipart=0 ; ipart<(int)size() ; ipart++ ) {
-        if ( cell_keys[ipart] == -1 ) {
+    for( int ipart=0 ; ipart<(int)size() ; ipart++ ) {
+        if ( cell_keys[ipart] < 0 ) {
             copyParticle( ipart, *particles_to_move );
         }
     }
 }
diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h
--- a/src/Particles/Particles.h
+++ b/src/Particles/Particles.h
+    //! Copy particles indexed by array 'indices' to dest_id in dest_parts
+    void copyParticles( std::vector<size_t> indices, Particles &dest_parts, int dest_id );
     //! Make a new particle at the position of another
     void makeParticleAt( Particles &source_particles, unsigned int ipart, double w, short q=0., double px=0., double py=0., double pz=0. );

@@ -151,6 +153,8 @@ class Particles
     void eraseParticle( unsigned int iPart, bool compute_cell_keys = false );
     //! Suppress nPart particles from iPart
     void eraseParticle( unsigned int iPart, unsigned int nPart, bool compute_cell_keys = false );
+    //! Suppress indexed particles
+    void eraseParticles( std::vector<size_t> indices );
     //! Suppress all particles from iPart to the end of particle array
     void eraseParticleTrail( unsigned int iPart, bool compute_cell_keys = false );
diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp
index b8ed401d9..0bf353e67 100755
--- a/src/Patch/Patch.cpp
+++ b/src/Patch/Patch.cpp
@@ -517,220 +517,184 @@ void Patch::updateMPIenv( SmileiMPI *smpi )
 // ---------------------------------------------------------------------------------------------------------------------
 void Patch::cleanMPIBuffers( int ispec, Params &params )
 {
-    int ndim = params.nDim_field;
+    size_t ndim = params.nDim_field;
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;

-    for( int iDim=0 ; iDim < ndim ; iDim++ ) {
+    for( size_t iDim=0 ; iDim < ndim ; iDim++ ) {
         for( int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) {
-            vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][iNeighbor].clear();//resize(0,ndim);
-            vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor].clear();//resize(0,ndim);
-            vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor].clear();
-            //vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor].resize(0);
-            vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][iNeighbor] = 0;
+            buffer.partRecv[iDim][iNeighbor]->clear();
+            buffer.partSend[iDim][iNeighbor]->clear();
         }
     }
 } // cleanMPIBuffers

 // ---------------------------------------------------------------------------------------------------------------------
-// Split particles Id to send in per direction and per patch neighbor dedicated buffers
-// Apply periodicity if necessary
+// Copy particles to be exchanged to buffers
 // ---------------------------------------------------------------------------------------------------------------------
-void Patch::initExchParticles( int ispec, Params &params )
+void Patch::copyExchParticlesToBuffers( int ispec, Params &params )
 {
-    Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move );
-    int ndim = params.nDim_field;
-    int idim, check;
-//    double xmax[3];
-
-    for( int iDim=0 ; iDim < ndim ; iDim++ ) {
-        for( int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) {
-            vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][iNeighbor].clear();//resize(0,ndim);
-            vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor].clear();//resize(0,ndim);
-            vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor].resize( 0 );
-            vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][iNeighbor] = 0;
-        }
-    }
-
-    int n_part_send = cuParticles.size();
-
-    int iPart;
-
-    // Define where particles are going
-    //Put particles in the send buffer it belongs to. Priority to lower dimensions.
-    if( params.geometry != "AMcylindrical" ) {
-        for( int i=0 ; i<n_part_send ; i++ ) {
-            iPart = i;
-            check = 0;
-            idim = 0;
-            while( check == 0 && idim<ndim ) {
-                if( cuParticles.position( idim, iPart ) < min_local_[idim] ) {
-                    if( neighbor_[idim][0]!=MPI_PROC_NULL ) {
-                        vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][0].push_back( iPart );
-                    }
-                    //If particle is outside of the global domain (has no neighbor), it will not be put in a send buffer and will simply be deleted.
-                    check = 1;
-                } else if( cuParticles.position( idim, iPart ) >= max_local_[idim] ) {
-                    if( neighbor_[idim][1]!=MPI_PROC_NULL ) {
-                        vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][1].push_back( iPart );
-                    }
-                    check = 1;
-                }
-                idim++;
-            }
-        }
-    } else { //if (geometry == "AMcylindrical")
-        double r_min2, r_max2;
-        r_max2 = max_local_[1] * max_local_[1] ;
-        r_min2 = min_local_[1] * min_local_[1] ;
-        for( int i=0 ; i<n_part_send ; i++ ) {
-            iPart = i;
-            if( cuParticles.position( 0, iPart ) < min_local_[0] ) {
-                if ( (Pcoordinates[0]==0) && ( vecSpecies[ispec]->boundary_conditions_[0][0]!="periodic" ) ) {
-                    continue;
-                }
-                vecSpecies[ispec]->MPI_buffer_.part_index_send[0][0].push_back( iPart );
-                //MESSAGE("Sending particle to the left x= " << cuParticles.position(0,iPart) << " xmin = " << min_local_[0] );
-                //If particle is outside of the global domain (has no neighbor), it will not be put in a send buffer and will simply be deleted.
-            } else if( cuParticles.position( 0, iPart ) >= max_local_[0] ) {
-                if ( (Pcoordinates[0]==params.number_of_patches[0]-1) && ( vecSpecies[ispec]->boundary_conditions_[0][1]!="periodic" ) ) {
-                    continue;
-                }
-                if( neighbor_[0][1]!=MPI_PROC_NULL ) {
-                    vecSpecies[ispec]->MPI_buffer_.part_index_send[0][1].push_back( iPart );
-                    // MESSAGE("Sending particle to the right x= " << cuParticles.position(0,iPart) << " xmax = " << max_local_[0] );
-                }
-            } else if( cuParticles.distance2ToAxis( iPart ) < r_min2 ) {
-                if( neighbor_[1][0]!=MPI_PROC_NULL ) {
-                    vecSpecies[ispec]->MPI_buffer_.part_index_send[1][0].push_back( iPart );
-                    //MESSAGE("Sending particle to the south r= " << cuParticles.distance2ToAxis(iPart) << " rmin2 = " << r_min2 );
-                }
-            } else if( cuParticles.distance2ToAxis( iPart ) >= r_max2 ) {
-                if( neighbor_[1][1]!=MPI_PROC_NULL ) {
-                    vecSpecies[ispec]->MPI_buffer_.part_index_send[1][1].push_back( iPart );
-                    //MESSAGE("Sending particle to the north r= " << cuParticles.distance2ToAxis(iPart) << " rmax2 = " << r_max2 << " rmin2= " << r_min2 );
-                }
-            }
-        }
-    }
-} // initExchParticles(... iDim)
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+    Particles &part = *vecSpecies[ispec]->particles;
+
+    cleanMPIBuffers( ispec, params );
+
+    vector<vector<bool>> copy( 3 );
+    copy[0] = { neighbor_[0][0] != MPI_PROC_NULL, neighbor_[0][1] != MPI_PROC_NULL };
+    copy[1] = { neighbor_[1][0] != MPI_PROC_NULL, neighbor_[1][1] != MPI_PROC_NULL };
+    if( params.nDim_field > 2 ) {
+        copy[2] = { neighbor_[2][0] != MPI_PROC_NULL, neighbor_[2][1] != MPI_PROC_NULL };
+    }
+    if( params.geometry == "AMcylindrical" ) {
+        copy[0][0] = copy[0][0] && ( Pcoordinates[0]!=0 || vecSpecies[ispec]->boundary_conditions_[0][0]=="periodic" );
+        copy[0][1] = copy[0][1] && ( Pcoordinates[0]!=params.number_of_patches[0]-1 || vecSpecies[ispec]->boundary_conditions_[0][1]=="periodic" );
+    }
+
+    // Loop all particles and copy the outgoing ones to the send buffers
+    for( size_t ipart = 0; ipart < part.size(); ipart++ ) {
+        if( part.cell_keys[ipart] < -1 ) {
+            if( part.cell_keys[ipart] == -2 ) {
+                if( copy[0][0] ) {
+                    part.copyParticle( ipart, *buffer.partSend[0][0] );
+                }
+            } else if( part.cell_keys[ipart] == -3 ) {
+                if( copy[0][1] ) {
+                    part.copyParticle( ipart, *buffer.partSend[0][1] );
+                }
+            } else if( part.cell_keys[ipart] == -4 ) {
+                if( copy[1][0] ) {
+                    part.copyParticle( ipart, *buffer.partSend[1][0] );
+                }
+            } else if( part.cell_keys[ipart] == -5 ) {
+                if( copy[1][1] ) {
+                    part.copyParticle( ipart, *buffer.partSend[1][1] );
+                }
+            } else if( part.cell_keys[ipart] == -6 ) {
+                if( copy[2][0] ) {
+                    part.copyParticle( ipart, *buffer.partSend[2][0] );
+                }
+            } else if( part.cell_keys[ipart] == -7 ) {
+                if( copy[2][1] ) {
+                    part.copyParticle( ipart, *buffer.partSend[2][1] );
+                }
+            }
+        }
+    }
+} // copyExchParticlesToBuffers(... iDim)
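Note on the new cell_keys convention used above: the boundary conditions now tag an outgoing particle with a negative key that encodes which boundary it crossed (-2-2*d for the lower side of dimension d, -3-2*d for the upper side; the AM radial checks use -4/-5), and copyExchParticlesToBuffers() dispatches on that value. A minimal standalone sketch of the decoding, with a hypothetical helper name that is not part of the commit:

    #include <cassert>

    // Decode a cell_keys value < -1 into (dimension, side), i.e. the indices
    // used to pick buffer.partSend[dim][side] in copyExchParticlesToBuffers().
    inline void decode_exchange_key( int key, int &dim, int &side )
    {
        assert( key < -1 );
        dim  = ( -key - 2 ) / 2;  // -2,-3 -> 0 ; -4,-5 -> 1 ; -6,-7 -> 2
        side = ( -key - 2 ) % 2;  // even -> lower neighbor, odd -> upper neighbor
    }

    int main()
    {
        int dim, side;
        decode_exchange_key( -5, dim, side );  // crossed the upper bound of dimension 1
        assert( dim == 1 && side == 1 );
        return 0;
    }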
 // ---------------------------------------------------------------------------------------------------------------------
-// For direction iDim, start exchange of number of particles
-// - vecPatch : used for intra-MPI process comm (direct copy using Particles::copyParticles)
-// - smpi : inherited from previous SmileiMPI::exchangeParticles()
+// Exchange number of particles to exchange to establish or not a communication
 // ---------------------------------------------------------------------------------------------------------------------
 void Patch::exchNbrOfParticles( SmileiMPI *smpi, int ispec, Params &, int iDim, VectorPatch *vecPatch )
 {
-    int h0 = ( *vecPatch )( 0 )->hindex;
-    /********************************************************************************/
-    // Exchange number of particles to exchange to establish or not a communication
-    /********************************************************************************/
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+
     for( int iNeighbor=0 ; iNeighbor<nbNeighbors_ ; iNeighbor++ ) {
+
+        int iOppositeNeighbor = ( iNeighbor+1 )%2;
+        buffer.partSendSize[iDim][iNeighbor] = buffer.partSend[iDim][iNeighbor]->size();
+
+        // Send number of particles to neighbor
         if( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) {
-            vecSpecies[ispec]->MPI_buffer_.part_index_send_sz[iDim][iNeighbor] = ( vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor] ).size();
             if( is_a_MPI_neighbor( iDim, iNeighbor ) ) {
-                //If neighbour is MPI ==> I send him the number of particles I'll send later.
                 int local_hindex = hindex - vecPatch->refHindex_;
                 int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 );
-                MPI_Isend( &( vecSpecies[ispec]->MPI_buffer_.part_index_send_sz[iDim][iNeighbor] ), 1, MPI_INT, MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.srequest[iDim][iNeighbor] ) );
+                MPI_Isend( &buffer.partSendSize[iDim][iNeighbor], 1, MPI_INT, MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &buffer.srequest[iDim][iNeighbor] );
             } else {
-                //Else, I directly set the receive size to the correct value.
-                ( *vecPatch )( neighbor_[iDim][iNeighbor]- h0 )->vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2] = vecSpecies[ispec]->MPI_buffer_.part_index_send_sz[iDim][iNeighbor];
+                // If the destination is on the same MPI process, directly set the number at the destination
+                int destination_hindex = neighbor_[iDim][iNeighbor] - vecPatch->refHindex_;
+                SpeciesMPIbuffers &destination_buffer = ( *vecPatch )( destination_hindex )->vecSpecies[ispec]->MPI_buffer_;
+                destination_buffer.partRecvSize[iDim][iOppositeNeighbor] = buffer.partSendSize[iDim][iNeighbor];
             }
-        } // END of Send
-
-        if( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) {
-            if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) {
-                //If other neighbour is MPI ==> I receive the number of particles I'll receive later.
-                int local_hindex = neighbor_[iDim][( iNeighbor+1 )%2] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][( iNeighbor+1 )%2] ];
+        }
+
+        // Receive number of particles from neighbor
+        if( neighbor_[iDim][iOppositeNeighbor]!=MPI_PROC_NULL ) {
+            if( is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) {
+                int local_hindex = neighbor_[iDim][iOppositeNeighbor] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][iOppositeNeighbor] ];
                 int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 );
-                MPI_Irecv( &( vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2] ), 1, MPI_INT, MPI_neighbor_[iDim][( iNeighbor+1 )%2], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ) );
+                MPI_Irecv( &buffer.partRecvSize[iDim][iOppositeNeighbor], 1, MPI_INT, MPI_neighbor_[iDim][iOppositeNeighbor], tag, MPI_COMM_WORLD, &buffer.rrequest[iDim][iOppositeNeighbor] );
             }
         }
-    }//end loop on nb_neighbors.
-
+
+    }
+
 } // exchNbrOfParticles(... iDim)

+// ---------------------------------------------------------------------------------------------------------------------
+// Wait for end of communications over number of particles
+// ---------------------------------------------------------------------------------------------------------------------
 void Patch::endNbrOfParticles( int ispec, int iDim )
 {
-    Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move );
-
-    /********************************************************************************/
-    // Wait for end of communications over number of particles
-    /********************************************************************************/
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+
     for( int iNeighbor=0 ; iNeighbor<nbNeighbors_ ; iNeighbor++ ) {
-        MPI_Status sstat[2];
-        MPI_Status rstat[2];
-        if( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) {
-            if( is_a_MPI_neighbor( iDim, iNeighbor ) ) {
-                MPI_Wait( &( vecSpecies[ispec]->MPI_buffer_.srequest[iDim][iNeighbor] ), &( sstat[iNeighbor] ) );
-            }
+        int iOppositeNeighbor = ( iNeighbor+1 )%2;
+
+        MPI_Status sstat[2];
+        MPI_Status rstat[2];
+        if( is_a_MPI_neighbor( iDim, iNeighbor ) ) {
+            MPI_Wait( &( buffer.srequest[iDim][iNeighbor] ), &( sstat[iNeighbor] ) );
         }
-        if( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) {
-            if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) {
-                MPI_Wait( &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ), &( rstat[( iNeighbor+1 )%2] ) );
-                if( vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]!=0 ) {
-                    //If I receive particles over MPI, I initialize my receive buffer with the appropriate size.
-                    vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2].initialize( vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2], cuParticles );
-                }
-            }
+        if( is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) {
+            MPI_Wait( &( buffer.rrequest[iDim][iOppositeNeighbor] ), &( rstat[iOppositeNeighbor] ) );
         }
     }
-
 } // END endNbrOfParticles(... iDim)

 // ---------------------------------------------------------------------------------------------------------------------
-// For direction iDim, finalize receive of number of particles and really send particles
+// For direction iDim, prepare particles to be sent
 // - vecPatch : used for intra-MPI process comm (direct copy using Particles::copyParticles)
 // - smpi : used smpi->periods_
 // ---------------------------------------------------------------------------------------------------------------------
 void Patch::prepareParticles( SmileiMPI *smpi, int ispec, Params &params, int iDim, VectorPatch *vecPatch )
 {
-    Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move );
-
-    int n_part_send;
-    int h0 = ( *vecPatch )( 0 )->hindex;
     double x_max = params.cell_length[iDim]*( params.global_size_[iDim] );
-
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+
     for( int iNeighbor=0 ; iNeighbor<nbNeighbors_ ; iNeighbor++ ) {
-        n_part_send = ( vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor] ).size();
-        if( ( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) && ( n_part_send!=0 ) ) {
-            // Enabled periodicity
+
+        Particles &partSend = *buffer.partSend[iDim][iNeighbor];
+
+        // Apply periodicity if enabled
+        if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL && partSend.size() != 0 ) {
             if( smpi->periods_[iDim]==1 ) {
-                for( int iPart=0 ; iPart<n_part_send ; iPart++ ) {
-                    if( ( iNeighbor==0 ) && ( Pcoordinates[iDim] == 0 ) && ( cuParticles.position( iDim, vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) < 0. ) ) {
-                        cuParticles.position( iDim, vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) += x_max;
-                    } else if( ( iNeighbor==1 ) && ( Pcoordinates[iDim] == params.number_of_patches[iDim]-1 ) && ( cuParticles.position( iDim, vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) >= x_max ) ) {
-                        cuParticles.position( iDim, vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) -= x_max;
+                if( iNeighbor == 0 && Pcoordinates[iDim] == 0 ) {
+                    for( size_t iPart=0; iPart < partSend.size(); iPart++ ) {
+                        if( partSend.position( iDim, iPart ) < 0. ) {
+                            partSend.position( iDim, iPart ) += x_max;
+                        }
+                    }
+                }
+                if( iNeighbor == 1 && Pcoordinates[iDim] == params.number_of_patches[iDim]-1 ) {
+                    for( size_t iPart=0; iPart < partSend.size(); iPart++ ) {
+                        if( partSend.position( iDim, iPart ) >= x_max ) {
+                            partSend.position( iDim, iPart ) -= x_max;
+                        }
                     }
                 }
             }
-            // Send particles
+        }
+
+        if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL ) {
+            // Initialize receive buffer with the appropriate size
             if( is_a_MPI_neighbor( iDim, iNeighbor ) ) {
-                // If MPI comm, first copy particles in the sendbuffer
-                for( int iPart=0 ; iPart<n_part_send ; iPart++ ) {
-                    cuParticles.copyParticle( vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart], vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor] );
-                }
-            } else {
-                //If not MPI comm, copy particles directly in the receive buffer
-                for( int iPart=0 ; iPart<n_part_send ; iPart++ ) {
-                    cuParticles.copyParticle( vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart], ( ( *vecPatch )( neighbor_[iDim][iNeighbor]- h0 )->vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ) );
+                if( buffer.partRecvSize[iDim][iNeighbor]!=0 ) {
+                    buffer.partRecv[iDim][iNeighbor]->initialize( buffer.partRecvSize[iDim][iNeighbor], *vecSpecies[ispec]->particles );
                 }
             }
-        } // END of Send
-
+            // Swap particles to the other patch directly if it belongs to the same MPI process
+            else {
+                int iOppositeNeighbor = ( iNeighbor+1 )%2;
+                SpeciesMPIbuffers &neighbor_buffer = ( *vecPatch )( neighbor_[iDim][iNeighbor]- vecPatch->refHindex_ )->vecSpecies[ispec]->MPI_buffer_;
+                swap( buffer.partSend[iDim][iNeighbor], neighbor_buffer.partRecv[iDim][iOppositeNeighbor] );
+            }
+        }
+
     } // END for iNeighbor
 } // END prepareParticles(... iDim)

@@ -738,169 +702,135 @@
 void Patch::exchParticles( SmileiMPI *smpi, int ispec, Params &, int iDim, VectorPatch *vecPatch )
 {
-    int n_part_send, n_part_recv;
-
-    for( int iNeighbor=0 ; iNeighbor<nbNeighbors_ ; iNeighbor++ ) {
-        n_part_send = ( vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor] ).size();
-        if( ( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) && ( n_part_send!=0 ) ) {
-            // Send particles
-            if( is_a_MPI_neighbor( iDim, iNeighbor ) ) {
-                // Then send particles
-                int local_hindex = hindex - vecPatch->refHindex_;
-                int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 );
-                vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &( vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor] ) );
-                MPI_Isend( &( ( vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor] ).position( 0, 0 ) ), 1, vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.srequest[iDim][iNeighbor] ) );
-            }
-        } // END of Send
-
-        n_part_recv = vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2];
-        if( ( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) && ( n_part_recv!=0 ) ) {
-            if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) {
-                // If MPI comm, receive particles in the recv buffer previously initialized.
-                vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ) );
-                int local_hindex = neighbor_[iDim][( iNeighbor+1 )%2] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][( iNeighbor+1 )%2] ];
-                int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 );
-                MPI_Irecv( &( ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).position( 0, 0 ) ), 1, vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][( iNeighbor+1 )%2], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ) );
-            }
-
-        } // END of Recv
-
-    } // END for iNeighbor
-
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+
+    for( int iNeighbor=0; iNeighbor<nbNeighbors_; iNeighbor++ ) {
+        // MESSAGE("n_send "<<buffer.partSend[iDim][iNeighbor]->size()<<" n_recv "<<buffer.partRecv[iDim][iNeighbor]->size());
+        // Send
+        Particles &partSend = *buffer.partSend[iDim][iNeighbor];
+        if( partSend.size() != 0 && is_a_MPI_neighbor( iDim, iNeighbor ) ) {
+            int local_hindex = hindex - vecPatch->refHindex_;
+            int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 );
+            vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &partSend );
+            MPI_Isend( &partSend.position( 0, 0 ), 1, vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &( buffer.srequest[iDim][iNeighbor] ) );
+        }
+
+        // Receive
+        int iOppositeNeighbor = ( iNeighbor+1 )%2;
+        Particles &partRecv = *buffer.partRecv[iDim][iOppositeNeighbor];
+        if( partRecv.size() != 0 && is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) {
+            // MESSAGE(" patch "<<hindex<<" n_recv "<<partRecv.size());
+            vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &partRecv );
+            int local_hindex = neighbor_[iDim][iOppositeNeighbor] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][iOppositeNeighbor] ];
+            int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 );
+            MPI_Irecv( &partRecv.position( 0, 0 ), 1, vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][iOppositeNeighbor], tag, MPI_COMM_WORLD, &buffer.rrequest[iDim][iOppositeNeighbor] );
        }

+    }
+
 } // END exchParticles(... iDim)

 // ---------------------------------------------------------------------------------------------------------------------
-// For direction iDim, finalize receive of particles, temporary store particles if diagonalParticles
-// And store recv particles at their definitive place.
-// Call Patch::cleanupSentParticles
-// - vecPatch : used for intra-MPI process comm (direct copy using Particles::copyParticles)
-// - smpi : used smpi->periods_
+// For direction iDim, wait for the receive of particles
 // ---------------------------------------------------------------------------------------------------------------------
-void Patch::finalizeExchParticles( int ispec, int iDim )
+void Patch::waitExchParticles( int ispec, int iDim )
 {
-
-    int n_part_send, n_part_recv;
-
-    /********************************************************************************/
-    // Wait for end of communications over Particles
-    /********************************************************************************/
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+
     for( int iNeighbor=0 ; iNeighbor<nbNeighbors_ ; iNeighbor++ ) {
         MPI_Status sstat[2];
         MPI_Status rstat[2];
-        n_part_send = vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor].size();
-        n_part_recv = vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2];
-
-        if( ( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) && ( n_part_send!=0 ) ) {
-            if( is_a_MPI_neighbor( iDim, iNeighbor ) ) {
-                MPI_Wait( &( vecSpecies[ispec]->MPI_buffer_.srequest[iDim][iNeighbor] ), &( sstat[iNeighbor] ) );
-                MPI_Type_free( &( vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] ) );
-            }
+
+        int iOppositeNeighbor = ( iNeighbor+1 )%2;
+        Particles &partSend = *buffer.partSend[iDim][iNeighbor];
+        Particles &partRecv = *buffer.partRecv[iDim][iOppositeNeighbor];
+
+        if( partSend.size() != 0 && is_a_MPI_neighbor( iDim, iNeighbor ) ) {
+            MPI_Wait( &buffer.srequest[iDim][iNeighbor], &sstat[iNeighbor] );
+            MPI_Type_free( &vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] );
         }
-        if( ( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) && ( n_part_recv!=0 ) ) {
-            if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) {
-                MPI_Wait( &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ), &( rstat[( iNeighbor+1 )%2] ) );
-                MPI_Type_free( &( vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] ) );
-            }
+        if( partRecv.size() != 0 && is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) {
+            MPI_Wait( &buffer.rrequest[iDim][iOppositeNeighbor], &rstat[iOppositeNeighbor] );
+            MPI_Type_free( &vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] );
         }
     }
 }
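The main change in prepareParticles() is the intra-process path: instead of copying particles one by one into the neighbor's receive buffer, the whole send buffer pointer is swapped with the neighbor's partRecv pointer. A toy sketch of that zero-copy hand-off (illustration only; ToyBuffer is a hypothetical stand-in for SpeciesMPIbuffers):

    #include <cassert>
    #include <utility>
    #include <vector>

    struct ToyBuffer {                       // stands in for SpeciesMPIbuffers
        std::vector<double> *partSend;
        std::vector<double> *partRecv;
    };

    int main()
    {
        std::vector<double> mine = { 1., 2., 3. };  // outgoing particles
        std::vector<double> theirs;                 // neighbor's empty recv buffer
        ToyBuffer me  { &mine,   nullptr };
        ToyBuffer you { nullptr, &theirs };

        // O(1) pointer swap, no per-particle copy:
        std::swap( me.partSend, you.partRecv );
        assert( you.partRecv->size() == 3 );        // neighbor now owns the particles
        assert( me.partSend->empty() );             // sender keeps an empty buffer for reuse
        return 0;
    }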
 void Patch::cornersParticles( int ispec, Params &params, int iDim )
 {
     int ndim = params.nDim_field;
-    int idim, check;
-
-    Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move );
-
-    int n_part_recv;
-
-    /********************************************************************************/
-    // Wait for end of communications over Particles
-    /********************************************************************************/
+    SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+
+    // No need to treat diag particles at last dimension
+    if( iDim == ndim-1 ) {
+        return;
+    }
+
     for( int iNeighbor=0 ; iNeighbor<nbNeighbors_ ; iNeighbor++ ) {
-        n_part_recv = vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2];
-
-        if( ( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) && ( n_part_recv!=0 ) ) {
-
-            // Treat diagonalParticles
-            if( iDim < ndim-1 ) { // No need to treat diag particles at last dimension.
-                if( params.geometry != "AMcylindrical" ) {
-                    for( int iPart=n_part_recv-1 ; iPart>=0; iPart-- ) {
-                        check = 0;
-                        idim = iDim+1;//We check next dimension
-                        while( check == 0 && idim<ndim ) {
-                            if( ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).position( idim, iPart ) < min_local_[idim] ) {
-                                if( neighbor_[idim][0]!=MPI_PROC_NULL ) { //if neighbour exists
-                                    //... copy it at the back of the local particle vector ...
-                                    ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles );
-                                    //...adjust particles->last_index or cell_keys ...
-                                    //vecSpecies[ispec]->addSpaceForOneParticle();
-                                    //... and add its index to the particles to be sent later...
-                                    vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][0].push_back( cuParticles.size()-1 );
-                                }
-                                //Remove it from receive buffer.
-                                ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).eraseParticle( iPart );
-                                vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]--;
-                                check = 1;
-                            }
-                            //Other side of idim
-                            else if( ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).position( idim, iPart ) >= max_local_[idim] ) {
-                                if( neighbor_[idim][1]!=MPI_PROC_NULL ) { //if neighbour exists
-                                    ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles );
-                                    //...adjust particles->last_index or cell_keys ...
-                                    //vecSpecies[ispec]->addSpaceForOneParticle();
-                                    vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][1].push_back( cuParticles.size()-1 );
-                                }
-                                ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).eraseParticle( iPart );
-                                vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]--;
-                                check = 1;
-                            }
-                            idim++;
-                        }
-                    }
-                } else { //In AM geometry
-                    //In this case, iDim = 0 and idim = iDim + 1 = 1. We only have to check potential comms along R.
-                    double r_min2, r_max2;
-                    r_min2 = min_local_[1]*min_local_[1];
-                    r_max2 = max_local_[1]*max_local_[1];
-                    for( int iPart=n_part_recv-1 ; iPart>=0; iPart-- ) {
-                        //MESSAGE("test particle diag r2 = " << (vecSpecies[ispec]->MPI_buffer_.partRecv[0][(iNeighbor+1)%2]).distance2ToAxis(iPart) << "rmin2 = " << r_min2 << " rmax2 = " << r_max2 );
-                        if( ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).distance2ToAxis( iPart ) < r_min2 ) {
-                            if( neighbor_[1][0]!=MPI_PROC_NULL ) { //if neighbour exists
-                                //... copy it at the back of the local particle vector ...
-                                ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles );
-                                //...adjust particles->last_index or cell_keys ...
-                                //vecSpecies[ispec]->addSpaceForOneParticle();
-                                //... and add its index to the particles to be sent later...
-                                vecSpecies[ispec]->MPI_buffer_.part_index_send[1][0].push_back( cuParticles.size()-1 );
-                                //..without forgetting to add it to the list of particles to clean.
-                            }
-                            //Remove it from receive buffer.
-                            ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).eraseParticle( iPart );
-                            vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[0][( iNeighbor+1 )%2]--;
-                        }
-                        //Other side of idim
-                        else if( ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).distance2ToAxis( iPart ) >= r_max2 ) {
-                            if( neighbor_[1][1]!=MPI_PROC_NULL ) { //if neighbour exists
-                                //MESSAGE("particle diag +R");
-                                ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles );
-                                //...adjust particles->last_index or cell_keys ...
-                                //vecSpecies[ispec]->addSpaceForOneParticle();
-                                vecSpecies[ispec]->MPI_buffer_.part_index_send[1][1].push_back( cuParticles.size()-1 );
-                            }
-                            ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).eraseParticle( iPart );
-                            vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[0][( iNeighbor+1 )%2]--;
-                        }
-                    }
-                }
-            }//If not last dim for diagonal particles.
+
+        Particles &partRecv = *buffer.partRecv[iDim][iNeighbor];
+
+        vector<vector<size_t>> indices_corner_min( ndim-iDim-1 );
+        vector<vector<size_t>> indices_corner_max( ndim-iDim-1 );
+        vector<size_t> indices_all_corners;
+
+        if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL && partRecv.size() != 0 ) {
+
+            // Find corner particles and store their indices
+            if( params.geometry != "AMcylindrical" ) {
+
+                for( size_t iPart = 0; iPart < partRecv.size(); iPart++ ) {
+                    for( size_t otherDim = iDim+1; otherDim < (size_t) ndim; otherDim++ ) {
+                        if( partRecv.position( otherDim, iPart ) < min_local_[otherDim] ) {
+                            indices_corner_min[otherDim-iDim-1].push_back( iPart );
+                            indices_all_corners.push_back( iPart );
+                            break;
+                        } else if( partRecv.position( otherDim, iPart ) >= max_local_[otherDim] ) {
+                            indices_corner_max[otherDim-iDim-1].push_back( iPart );
+                            indices_all_corners.push_back( iPart );
+                            break;
+                        }
+                    }
+                }
+
+            } else { //In AM geometry
+
+                //In this case, iDim = 0 and idim = iDim + 1 = 1. We only have to check potential comms along R.
+                double r_min2 = min_local_[1]*min_local_[1];
+                double r_max2 = max_local_[1]*max_local_[1];
+
+                for( size_t iPart = 0; iPart < partRecv.size(); iPart++ ) {
+                    if( partRecv.distance2ToAxis( iPart ) < r_min2 ) {
+                        indices_corner_min[0].push_back( iPart );
+                        indices_all_corners.push_back( iPart );
+                        break;
+                    } else if( partRecv.distance2ToAxis( iPart ) >= r_max2 ) {
+                        indices_corner_max[0].push_back( iPart );
+                        indices_all_corners.push_back( iPart );
+                        break;
+                    }
+                }
+            }
+
+            // Copy corner particles to the start or the end of the particles to be sent for the following dimension
+            for( size_t otherDim = iDim+1; otherDim < (size_t) ndim; otherDim++ ) {
+                if( indices_corner_min[otherDim-iDim-1].size() > 0 && neighbor_[otherDim][0] != MPI_PROC_NULL ) {
+                    partRecv.copyParticles( indices_corner_min[otherDim-iDim-1], *buffer.partSend[otherDim][0], 0 );
+                }
+                if( indices_corner_max[otherDim-iDim-1].size() > 0 && neighbor_[otherDim][1] != MPI_PROC_NULL ) {
+                    partRecv.copyParticles( indices_corner_max[otherDim-iDim-1], *buffer.partSend[otherDim][1], buffer.partSend[otherDim][1]->size() );
+                }
+            }
+
+            // Erase corner particles from the current recv array
+            if( indices_all_corners.size() > 0 ) {
+                partRecv.eraseParticles( indices_all_corners );
+            }
+
         } //If received something
     } //loop i Neighbor
 }

@@ -925,22 +855,20 @@ void Patch::importAndSortParticles( int ispec, Params &params )

 void Patch::cleanParticlesOverhead( Params &params )
 {
-    int ndim = params.nDim_field;
+
     for( unsigned int ispec=0 ; ispec<vecSpecies.size() ; ispec++ ) {
-        Particles &cuParticles = ( *vecSpecies[ispec]->particles );
-
-        for( int idim = 0; idim < ndim; idim++ ) {
+        SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_;
+
+        for( size_t idim = 0; idim < params.nDim_field; idim++ ) {
             for( int iNeighbor=0 ; iNeighbor<nbNeighbors_ ; iNeighbor++ ) {
-                vecSpecies[ispec]->MPI_buffer_.partRecv[idim][iNeighbor].clear();
-                vecSpecies[ispec]->MPI_buffer_.partRecv[idim][iNeighbor].shrinkToFit( );
-                vecSpecies[ispec]->MPI_buffer_.partSend[idim][iNeighbor].clear();
-                vecSpecies[ispec]->MPI_buffer_.partSend[idim][iNeighbor].shrinkToFit( );
-                vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][iNeighbor].clear();
-                vector<int>( vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][iNeighbor] ).swap( vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][iNeighbor] );
+                buffer.partRecv[idim][iNeighbor]->clear();
+                buffer.partRecv[idim][iNeighbor]->shrinkToFit( );
+                buffer.partSend[idim][iNeighbor]->clear();
+                buffer.partSend[idim][iNeighbor]->shrinkToFit( );
             }
         }
-
-        cuParticles.shrinkToFit( );
+
+        vecSpecies[ispec]->particles->shrinkToFit( );
     }
 }
diff --git a/src/Patch/Patch.h b/src/Patch/Patch.h
index 6fc3f7578..ff5a76a5c 100755
--- a/src/Patch/Patch.h
+++ b/src/Patch/Patch.h
@@ -174,7 +174,7 @@ class Patch
     //! Clean the MPI buffers for communications
     void cleanMPIBuffers( int ispec, Params &params );
     //! manage Idx of particles per direction,
-    void initExchParticles( int ispec, Params &params );
+    void copyExchParticlesToBuffers( int ispec, Params &params );
     //! init comm nbr of particles
     void exchNbrOfParticles( SmileiMPI *smpi, int ispec, Params &params, int iDim, VectorPatch *vecPatch );
     //! finalize comm / nbr of particles, init exch / particles
@@ -184,7 +184,7 @@
     //! effective exchange of particles
     void exchParticles( SmileiMPI *smpi, int ispec, Params &params, int iDim, VectorPatch *vecPatch );
     //! finalize exch / particles
-    void finalizeExchParticles( int ispec, int iDim );
+    void waitExchParticles( int ispec, int iDim );
     //! Treat diagonalParticles
     void cornersParticles( int ispec, Params &params, int iDim );
     //! inject particles received in main data structure and particles sorting
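cornersParticles() now relies on the new Particles::eraseParticles(), which removes a sorted list of indices from every property array in a single forward pass. A self-contained sketch of that compaction on one plain vector (illustration only; erase_sorted_indices is a hypothetical stand-in for the per-property loop):

    #include <cassert>
    #include <vector>

    // Remove the elements at the (sorted, non-empty) positions in 'indices',
    // shifting the survivors left in one pass, exactly like eraseParticles().
    static void erase_sorted_indices( std::vector<int> &v, const std::vector<size_t> &indices )
    {
        size_t j = 1, to = indices[0];
        size_t stop = ( indices.size() == 1 ) ? v.size() : indices[1];
        for( size_t from = indices[0] + 1; from < v.size(); from++ ) {
            if( from < stop ) {
                v[to++] = v[from];      // keep this element
            } else {
                j++;                    // 'from' is an erased index: skip it
                stop = ( j == indices.size() ) ? v.size() : indices[j];
            }
        }
        v.resize( v.size() - indices.size() );
    }

    int main()
    {
        std::vector<int> v = { 10, 11, 12, 13, 14, 15 };
        erase_sorted_indices( v, { 1, 4 } );               // remove 11 and 14
        assert( ( v == std::vector<int>{ 10, 12, 13, 15 } ) );
        return 0;
    }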
diff --git a/src/Patch/SyncVectorPatch.cpp b/src/Patch/SyncVectorPatch.cpp
index 09817b201..675529113 100755
--- a/src/Patch/SyncVectorPatch.cpp
+++ b/src/Patch/SyncVectorPatch.cpp
@@ -24,26 +24,15 @@ template void SyncVectorPatch::exchangeAlongAllDirections<double,Field>( std::vector<Field *> fields, VectorPatch &vecPatches, SmileiMPI *smpi );
 template void SyncVectorPatch::exchangeAlongAllDirections<std::complex<double>,cField>( std::vector<Field *> fields, VectorPatch &vecPatches, SmileiMPI *smpi );
 template void SyncVectorPatch::exchangeAlongAllDirectionsNoOMP<double,Field>( std::vector<Field *> fields, VectorPatch &vecPatches, SmileiMPI *smpi );
 template void SyncVectorPatch::exchangeAlongAllDirectionsNoOMP<std::complex<double>,cField>( std::vector<Field *> fields, VectorPatch &vecPatches, SmileiMPI *smpi );

-void SyncVectorPatch::exchangeParticles( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi )
+void SyncVectorPatch::initExchParticles( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi )
 {
     #pragma omp for schedule(runtime)
     for( unsigned int ipatch=0 ; ipatch<vecPatches.size() ; ipatch++ ) {
-        vecPatches( ipatch )->extractParticles();
-        vecPatches( ipatch )->initExchParticles( ispec, params );
-    }
-
-    // Init comm in direction 0
-#ifndef _NO_MPI_TM
-    #pragma omp for schedule(runtime)
-#else
-    #pragma omp single
-#endif
-    for( unsigned int ipatch=0 ; ipatch<vecPatches.size() ; ipatch++ ) {
-        vecPatches( ipatch )->exchNbrOfParticles( smpi, ispec, params, 0, &vecPatches );
+        vecPatches( ipatch )->copyExchParticlesToBuffers( ispec, params );
     }
+
+    // Start exchange along dimension 0 only
+    SyncVectorPatch::initExchParticlesAlongDimension( vecPatches, ispec, 0, params, smpi );
 }

 // ---------------------------------------------------------------------------------------------------------------------
@@ -52,24 +41,17 @@
 //! - the importation of the new particles in the particle property arrays
 //! - the sorting of particles
 // ---------------------------------------------------------------------------------------------------------------------
-void SyncVectorPatch::finalizeAndSortParticles( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi )
+void SyncVectorPatch::finalizeExchParticlesAndSort( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi )
 {
-    SyncVectorPatch::finalizeExchangeParticles( vecPatches, ispec, 0, params, smpi );
-
-    // Per direction
+    // finish exchange along dimension 0 only
+    SyncVectorPatch::finalizeExchParticlesAlongDimension( vecPatches, ispec, 0, params, smpi );
+
+    // Other directions
     for( unsigned int iDim=1 ; iDim<params.nDim_field ; iDim++ ) {
-#ifndef _NO_MPI_TM
-        #pragma omp for schedule(runtime)
-#else
-        #pragma omp single
-#endif
-        for( unsigned int ipatch=0 ; ipatch<vecPatches.size() ; ipatch++ ) {
-            vecPatches( ipatch )->exchNbrOfParticles( smpi, ispec, params, iDim, &vecPatches );
-        }
-
-        SyncVectorPatch::finalizeExchangeParticles( vecPatches, ispec, iDim, params, smpi );
+        SyncVectorPatch::initExchParticlesAlongDimension( vecPatches, ispec, iDim, params, smpi );
+        SyncVectorPatch::finalizeExchParticlesAlongDimension( vecPatches, ispec, iDim, params, smpi );
     }
-
+
     #pragma omp for schedule(runtime)
     for( unsigned int ipatch=0 ; ipatch<vecPatches.size() ; ipatch++ ) {
         vecPatches( ipatch )->importAndSortParticles( ispec, params );
@@ -108,8 +90,20 @@
 }

+void SyncVectorPatch::initExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params &params, SmileiMPI *smpi )
+{
+    // Exchange numbers of particles along dimension iDim
+#ifndef _NO_MPI_TM
+    #pragma omp for schedule(runtime)
+#else
+    #pragma omp single
+#endif
+    for( unsigned int ipatch=0 ; ipatch<vecPatches.size() ; ipatch++ ) {
+        vecPatches( ipatch )->exchNbrOfParticles( smpi, ispec, params, iDim, &vecPatches );
+    }
+}

-void SyncVectorPatch::finalizeExchangeParticles( VectorPatch &vecPatches, int ispec, int iDim, Params &params, SmileiMPI *smpi )
+void SyncVectorPatch::finalizeExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params &params, SmileiMPI *smpi )
 {
 #ifndef _NO_MPI_TM
     #pragma omp for schedule(runtime)
 #else
     #pragma omp single
 #endif
     for( unsigned int ipatch=0 ; ipatch<vecPatches.size() ; ipatch++ ) {
         vecPatches( ipatch )->endNbrOfParticles( ispec, iDim );
     }

@@ -140,7 +134,7 @@
 #ifndef _NO_MPI_TM
     #pragma omp for schedule(runtime)
 #else
     #pragma omp single
 #endif
     for( unsigned int ipatch=0 ; ipatch<vecPatches.size() ; ipatch++ ) {
-        vecPatches( ipatch )->finalizeExchParticles( ispec, iDim );
+        vecPatches( ipatch )->waitExchParticles( ispec, iDim );
     }

     #pragma omp for schedule(runtime)
diff --git a/src/Patch/SyncVectorPatch.h b/src/Patch/SyncVectorPatch.h
index 0ce868cae..0322c1283 100755
--- a/src/Patch/SyncVectorPatch.h
+++ b/src/Patch/SyncVectorPatch.h
@@ -17,9 +17,10 @@ class SyncVectorPatch
 public :

     //! Particles synchronization
-    static void exchangeParticles( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi );
-    static void finalizeAndSortParticles( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi );
-    static void finalizeExchangeParticles( VectorPatch &vecPatches, int ispec, int iDim, Params &params, SmileiMPI *smpi );
+    static void initExchParticles( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi );
+    static void finalizeExchParticlesAndSort( VectorPatch &vecPatches, int ispec, Params &params, SmileiMPI *smpi );
+    static void initExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params &params, SmileiMPI *smpi );
+    static void finalizeExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params &params, SmileiMPI *smpi );

     //! Densities synchronization
     static void sumRhoJ( Params &params, VectorPatch &vecPatches, SmileiMPI *smpi );
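For reference, the call order that results from the renamed SyncVectorPatch entry points: dimension 0 is started inside dynamics() and the remaining dimensions are chained during finalization, because cornersParticles() can feed the send buffers of dimension d+1 with particles received along d. A compilable sketch of the sequence (assumed from this diff, simplified; each step() stands for the method of the same name):

    #include <cstdio>

    static void step( const char *name, int iDim ) { std::printf( "%s along dim %d\n", name, iDim ); }

    int main()
    {
        const int nDim_field = 3;
        // during vecPatches.dynamics():
        step( "copyExchParticlesToBuffers (all dims' buffers filled once)", 0 );
        step( "initExchParticlesAlongDimension", 0 );
        // during vecPatches.finalizeExchParticlesAndSort():
        step( "finalizeExchParticlesAlongDimension", 0 );
        for( int iDim = 1; iDim < nDim_field; iDim++ ) {
            step( "initExchParticlesAlongDimension", iDim );
            step( "finalizeExchParticlesAlongDimension", iDim );
        }
        std::printf( "importAndSortParticles on every patch\n" );
        return 0;
    }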
diff --git a/src/Patch/VectorPatch.cpp b/src/Patch/VectorPatch.cpp
index 0c2fbb036..22d976ba2 100755
--- a/src/Patch/VectorPatch.cpp
+++ b/src/Patch/VectorPatch.cpp
@@ -322,7 +322,7 @@ void VectorPatch::initialParticleSorting( Params &params )
 }

 // ---------------------------------------------------------------------------------------------------------------------
-// For all patches, move particles (restartRhoJ(s), dynamics and exchangeParticles)
+// For all patches, move particles (restartRhoJ(s), dynamics and initExchParticles)
 // ---------------------------------------------------------------------------------------------------------------------
 void VectorPatch::dynamics( Params &params,
                             SmileiMPI *smpi,
@@ -402,7 +402,7 @@ void VectorPatch::dynamics( Params &params,
     for( unsigned int ispec=0 ; ispec<( *this )( 0 )->vecSpecies.size(); ispec++ ) {
         Species *spec = species( 0, ispec );
         if ( (!params.Laser_Envelope_model) && (spec->isProj( time_dual, simWindow )) ){
-            SyncVectorPatch::exchangeParticles( ( *this ), ispec, params, smpi ); // Included sortParticles
+            SyncVectorPatch::initExchParticles( ( *this ), ispec, params, smpi ); // Included sortParticles
         } // end condition on Species and on envelope model
     } // end loop on species
     //MESSAGE("exchange particles");
@@ -460,7 +460,7 @@ void VectorPatch::projectionForDiags( Params &params,

 // ---------------------------------------------------------------------------------------------------------------------
 //! For all patches, exchange particles and sort them.
 // ---------------------------------------------------------------------------------------------------------------------
-void VectorPatch::finalizeAndSortParticles( Params &params, SmileiMPI *smpi, SimWindow *simWindow,
+void VectorPatch::finalizeExchParticlesAndSort( Params &params, SmileiMPI *smpi, SimWindow *simWindow,
         double time_dual, Timers &timers, int itime )
 {
     timers.syncPart.restart();
@@ -471,7 +471,7 @@

     for( unsigned int ispec=0 ; ispec<( *this )( 0 )->vecSpecies.size(); ispec++ ) {
         if( ( *this )( 0 )->vecSpecies[ispec]->isProj( time_dual, simWindow ) ) {
-            SyncVectorPatch::finalizeAndSortParticles( ( *this ), ispec, params, smpi ); // Included sortParticles
+            SyncVectorPatch::finalizeExchParticlesAndSort( ( *this ), ispec, params, smpi ); // Included sortParticles
         }
     }

@@ -491,7 +491,7 @@

     timers.syncPart.update( params.printNow( itime ) );

-} // END finalizeAndSortParticles
+} // END finalizeExchParticlesAndSort


 //! Perform the particles merging on all patches
@@ -3030,7 +3030,7 @@ void VectorPatch::createPatches( Params &params, SmileiMPI *smpi, SimWindow *sim

     // Set Index of the 1st patch of the vector yet on current MPI rank
     // Is this really necessary ? It should be done already ...
-    refHindex_ = ( *this )( 0 )->Hindex();
+    setRefHindex();

     // Current number of patch
     int nPatches_now = this->size() ;
@@ -4645,7 +4645,7 @@ void VectorPatch::ponderomotiveUpdatePositionAndCurrents( Params &params,
     timers.syncPart.restart();
     for( unsigned int ispec=0 ; ispec<( *this )( 0 )->vecSpecies.size(); ispec++ ) {
         if( ( *this )( 0 )->vecSpecies[ispec]->isProj( time_dual, simWindow ) ) {
-            SyncVectorPatch::exchangeParticles( ( *this ), ispec, params, smpi ); // Included sortParticles
+            SyncVectorPatch::initExchParticles( ( *this ), ispec, params, smpi ); // Included sortParticles
         } // end condition on species
     } // end loop on species
     timers.syncPart.update( params.printNow( itime ) );
@@ -5421,7 +5421,7 @@ void VectorPatch::dynamicsWithTasks( Params &params,
             Species *spec_task = species( ipatch, ispec );
             for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) {
                 for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPart<spec_task->particles->last_index[scell]; iPart++ ) {
-                    if ( spec_task->particles->cell_keys[iPart] != -1 ) {
+                    if ( spec_task->particles->cell_keys[iPart] >= 0 ) {
                         //First reduction of the count sort algorithm. Lost particles are not included.
                         spec_task->count[spec_task->particles->cell_keys[iPart]] ++;
                     }
@@ -5437,7 +5437,7 @@
             Species *spec_task = species( ipatch, ispec );
             for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) {
                 for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPart<spec_task->particles->last_index[scell]; iPart++ ) {
-                    if ( spec_task->particles->cell_keys[iPart] != -1 ) {
+                    if ( spec_task->particles->cell_keys[iPart] >= 0 ) {
                         //First reduction of the count sort algorithm. Lost particles are not included.
                         spec_task->count[spec_task->particles->cell_keys[iPart]] ++;
                     }
@@ -5657,7 +5657,7 @@ void VectorPatch::ponderomotiveUpdatePositionAndCurrentsWithTasks( Params &param
             Species *spec_task = species( ipatch, ispec );
             for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) {
                 for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPart<spec_task->particles->last_index[scell]; iPart++ ) {
-                    if ( spec_task->particles->cell_keys[iPart] != -1 ) {
+                    if ( spec_task->particles->cell_keys[iPart] >= 0 ) {
                         //First reduction of the count sort algorithm. Lost particles are not included.
                         spec_task->count[spec_task->particles->cell_keys[iPart]] ++;
                     }
@@ -5675,7 +5675,7 @@
             Species *spec_task = species( ipatch, ispec );
             for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) {
                 for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPart<spec_task->particles->last_index[scell]; iPart++ ) {
-                    if ( spec_task->particles->cell_keys[iPart] != -1 ) {
+                    if ( spec_task->particles->cell_keys[iPart] >= 0 ) {
                         //First reduction of the count sort algorithm. Lost particles are not included.
                         spec_task->count[spec_task->particles->cell_keys[iPart]] ++;
                     }
diff --git a/src/Patch/VectorPatch.h b/src/Patch/VectorPatch.h
index 35be9ee6b..ff1493813 100755
--- a/src/Patch/VectorPatch.h
+++ b/src/Patch/VectorPatch.h
@@ -138,7 +138,7 @@ public :
     //! Particle sorting for all patches. This is done at initialization time.
     void initialParticleSorting( Params &params );

-    //! For all patch, move particles (restartRhoJ(s), dynamics and exchangeParticles)
+    //! For all patches, move particles (restartRhoJ(s), dynamics and initExchParticles)
     void dynamics( Params &params,
                    SmileiMPI *smpi,
                    SimWindow *simWindow,
@@ -157,7 +157,7 @@
                    Timers &timers, int itime );

     //! For all patches, exchange particles and sort them.
-    void finalizeAndSortParticles( Params &params, SmileiMPI *smpi, SimWindow *simWindow,
+    void finalizeExchParticlesAndSort( Params &params, SmileiMPI *smpi, SimWindow *simWindow,
                                    double time_dual, Timers &timers, int itime );

     void finalizeSyncAndBCFields( Params &params, SmileiMPI *smpi, SimWindow *simWindow,
diff --git a/src/Smilei.cpp b/src/Smilei.cpp
index 15cd7b047..0ab0db1a2 100755
--- a/src/Smilei.cpp
+++ b/src/Smilei.cpp
@@ -629,7 +629,7 @@ int main( int argc, char *argv[] )
         #pragma omp parallel shared (time_dual,smpi,params, vecPatches, region, simWindow, checkpoint, itime)
         {
             // finalize particle exchanges and sort particles
-            vecPatches.finalizeAndSortParticles( params, &smpi, simWindow,
+            vecPatches.finalizeExchParticlesAndSort( params, &smpi, simWindow,
                                                  time_dual, timers, itime );

             // Particle merging
diff --git a/src/SmileiMPI/AsyncMPIbuffers.cpp b/src/SmileiMPI/AsyncMPIbuffers.cpp
index 0f7cebe9d..a5a53dbb0 100755
--- a/src/SmileiMPI/AsyncMPIbuffers.cpp
+++ b/src/SmileiMPI/AsyncMPIbuffers.cpp
@@ -66,6 +66,12 @@ SpeciesMPIbuffers::SpeciesMPIbuffers()

 SpeciesMPIbuffers::~SpeciesMPIbuffers()
 {
+    for( size_t i=0 ; i<partRecv.size() ; i++ ) {
+        for( size_t j=0 ; j<2 ; j++ ) {
+            delete partRecv[i][j];
+            delete partSend[i][j];
+        }
+    }
 }

diff --git a/src/SmileiMPI/AsyncMPIbuffers.h b/src/SmileiMPI/AsyncMPIbuffers.h
--- a/src/SmileiMPI/AsyncMPIbuffers.h
+++ b/src/SmileiMPI/AsyncMPIbuffers.h
-    std::vector< std::vector<Particles> > partRecv;
+    std::vector< std::vector<Particles*> > partRecv;
     //! ndim vectors of 2 received packets of particles (1 per direction)
-    std::vector< std::vector<Particles> > partSend;
+    std::vector< std::vector<Particles*> > partSend;

-    //! ndim vectors of 2 vectors of index particles to send (1 per direction)
-    //! - not sent
-    //  - used to sort Species::indexes_of_particles_to_exchange built in Species::dynamics
-    std::vector< std::vector< std::vector<int> > > part_index_send;
     //! ndim vectors of 2 numbers of particles to send (1 per direction)
-    std::vector< std::vector< unsigned int > > part_index_send_sz;
+    std::vector< std::vector< unsigned int > > partSendSize;
     //! ndim vectors of 2 numbers of particles to receive (1 per direction)
-    std::vector< std::vector< unsigned int > > part_index_recv_sz;
+    std::vector< std::vector< unsigned int > > partRecvSize;
 };
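With this change partSend/partRecv hold owning raw pointers (so that prepareParticles() can swap them between patches), which is why SpeciesMPIbuffers gains an explicit destructor. A toy version of the ownership pattern (assumption, simplified; the real allocation presumably happens in SpeciesMPIbuffers::allocate, which is not visible in this diff):

    #include <vector>

    struct ToyParticles { std::vector<double> position; };

    struct ToySpeciesMPIbuffers {
        std::vector< std::vector<ToyParticles*> > partSend, partRecv;
        std::vector< std::vector<unsigned int> >  partSendSize, partRecvSize;

        explicit ToySpeciesMPIbuffers( size_t ndim )
            : partSend( ndim ), partRecv( ndim ),
              partSendSize( ndim, std::vector<unsigned int>( 2, 0 ) ),
              partRecvSize( ndim, std::vector<unsigned int>( 2, 0 ) )
        {
            for( size_t d = 0; d < ndim; d++ ) {
                partSend[d] = { new ToyParticles, new ToyParticles };
                partRecv[d] = { new ToyParticles, new ToyParticles };
            }
        }
        // Buffers may have been swapped with another patch, but each patch
        // still owns exactly one object per slot, so deleting here is safe.
        ~ToySpeciesMPIbuffers()
        {
            for( size_t d = 0; d < partRecv.size(); d++ ) {
                for( size_t n = 0; n < 2; n++ ) {
                    delete partSend[d][n];
                    delete partRecv[d][n];
                }
            }
        }
    };

    int main() { ToySpeciesMPIbuffers b( 2 ); return 0; }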
diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp
index 37462566f..31ab4c1a5 100755
--- a/src/Species/Species.cpp
+++ b/src/Species/Species.cpp
@@ -378,11 +378,8 @@ void Species::initOperators( Params &params, Patch *patch )
     partBoundCond = new PartBoundCond( params, this, patch );
     for( unsigned int iDim=0 ; iDim < nDim_field ; iDim++ ) {
         for( unsigned int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) {
-            MPI_buffer_.partRecv[iDim][iNeighbor].initialize( 0, ( *particles ) );
-            MPI_buffer_.partSend[iDim][iNeighbor].initialize( 0, ( *particles ) );
-            MPI_buffer_.part_index_send[iDim][iNeighbor].resize( 0 );
-            MPI_buffer_.part_index_recv_sz[iDim][iNeighbor] = 0;
-            MPI_buffer_.part_index_send_sz[iDim][iNeighbor] = 0;
+            MPI_buffer_.partRecv[iDim][iNeighbor]->initialize( 0, ( *particles ) );
+            MPI_buffer_.partSend[iDim][iNeighbor]->initialize( 0, ( *particles ) );
         }
     }
     typePartSend.resize( nDim_field*2, MPI_DATATYPE_NULL );
@@ -1774,10 +1771,10 @@ void Species::sortParticles( Params &params )
     // Merge all MPI_buffer_.partRecv in particles_to_move
     for( int idim = 0; idim < params.nDim_field; idim++ ) {
         for( int iNeighbor = 0; iNeighbor < 2; iNeighbor++ ) {
-            int n_part_recv = MPI_buffer_.part_index_recv_sz[idim][iNeighbor];
-            if( ( n_part_recv != 0 ) ) {
+            int n_part_recv = MPI_buffer_.partRecv[idim][iNeighbor]->size();
+            if( n_part_recv != 0 ) {
                 // insert n_part_recv in particles_to_move from 0
-                MPI_buffer_.partRecv[idim][iNeighbor].copyParticles( 0,
+                MPI_buffer_.partRecv[idim][iNeighbor]->copyParticles( 0,
                                                                      n_part_recv,
                                                                      *particles_to_move,
                                                                      particles_to_move->size() );
@@ -1809,10 +1806,10 @@
     //Merge all MPI_buffer_.partRecv in particles_to_move
     //    for( int idim = 0; idim < ndim; idim++ ) {
     //        for( int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) {
-    //            int n_part_recv = MPI_buffer_.part_index_recv_sz[idim][iNeighbor];
+    //            int n_part_recv = MPI_buffer_.partRecv[idim][iNeighbor]->size();
     //            if( ( n_part_recv!=0 ) ) {
     //                // insert n_part_recv in particles_to_move from 0
-    //                //MPI_buffer_.partRecv[idim][iNeighbor].copyParticles( 0, n_part_recv, *particles_to_move, 0 );
+    //                //MPI_buffer_.partRecv[idim][iNeighbor]->copyParticles( 0, n_part_recv, *particles_to_move, 0 );
     //                total_number_part_recv += n_part_recv;
     //                //particles->last_index[particles->last_index.size()-1] += n_part_recv;
     //                //particles->cell_keys.resize(particles->cell_keys.size()+n_part_recv);
@@ -1825,7 +1822,7 @@
     // Sort to adapt to cell_keys usage
     std::vector<int> indexes_of_particles_to_exchange;
     for ( int ipart=0 ; ipart< (int)(getNbrOfParticles()) ; ipart++ ) {
-        if ( particles->cell_keys[ipart] == -1 ) {
+        if ( particles->cell_keys[ipart] < 0 ) {
             indexes_of_particles_to_exchange.push_back( ipart );
         }
     }
@@ -1900,15 +1897,15 @@
     //Evaluation of the necessary shift of all bins.
     //idim=0
-    shift[1] += MPI_buffer_.part_index_recv_sz[0][0];//Particles coming from xmin all go to bin 0 and shift all the other bins.
-    shift[particles->last_index.size()] += MPI_buffer_.part_index_recv_sz[0][1];//Used only to count the total number of particles arrived.
+    shift[1] += MPI_buffer_.partRecv[0][0]->size();//Particles coming from xmin all go to bin 0 and shift all the other bins.
+    shift[particles->last_index.size()] += MPI_buffer_.partRecv[0][1]->size();//Used only to count the total number of particles arrived.
     //idim>0
     for( idim = 1; idim < ndim; idim++ ) {
         for( int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) {
-            n_part_recv = MPI_buffer_.part_index_recv_sz[idim][iNeighbor];
+            n_part_recv = MPI_buffer_.partRecv[idim][iNeighbor]->size();
             for( unsigned int j=0; j<( unsigned int )n_part_recv ; j++ ) {
                 //We first evaluate how many particles arrive in each bin.
-                ii = int( ( MPI_buffer_.partRecv[idim][iNeighbor].position( 0, j )-min_loc )/dbin ); //bin in which the particle goes.
+                ii = int( ( MPI_buffer_.partRecv[idim][iNeighbor]->position( 0, j )-min_loc )/dbin ); //bin in which the particle goes.
                 shift[ii+1]++; // It makes the next bins shift.
             }
         }
     }
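The shift[] bookkeeping above works in two stages: first shift[i+1] counts how many received particles land in bin i, then a running sum (performed a few lines further down in the source) turns the counts into per-bin offsets. A small self-contained illustration:

    #include <cassert>
    #include <vector>

    int main()
    {
        // 4 bins; received particles fall into bins 0, 0 and 2
        std::vector<int> shift( 4 + 1, 0 );
        for( int bin : { 0, 0, 2 } ) {
            shift[bin + 1]++;                 // "it makes the next bins shift"
        }
        for( size_t i = 1; i < shift.size(); i++ ) {
            shift[i] += shift[i - 1];         // cumulative sum -> offsets
        }
        // bin 0 stays in place, bins 1-2 move by 2 slots, bin 3 by 3 slots
        assert( shift[1] == 2 && shift[2] == 2 && shift[3] == 3 && shift[4] == 3 );
        return 0;
    }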
+ MPI_buffer_.partRecv[idim][iNeighbor]->overwriteParticle( j, *particles, particles->last_index[ii] ); particles->last_index[ii] ++ ; } } diff --git a/src/Species/SpeciesV.cpp b/src/Species/SpeciesV.cpp index 98d5d9dbb..89d12b340 100755 --- a/src/Species/SpeciesV.cpp +++ b/src/Species/SpeciesV.cpp @@ -518,7 +518,7 @@ void SpeciesV::dynamics( double time_dual, unsigned int ispec, nrj_lost_per_thd[tid] += mass_ * energy_lost; // for( iPart=particles->first_index[ipack*packsize_+scell] ; iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= length_[i]; @@ -552,7 +552,7 @@ void SpeciesV::dynamics( double time_dual, unsigned int ispec, // if( mass_>0 ) { // for( iPart=particles->first_index[ipack*packsize_+scell] ; iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -564,7 +564,7 @@ void SpeciesV::dynamics( double time_dual, unsigned int ispec, // } // for( iPart=particles->first_index[ipack*packsize_+scell] ; iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -1053,7 +1053,7 @@ void SpeciesV::dynamicsTasks( double time_dual, unsigned int ispec, if( mass_>0 ) { for( int scell = first_cell_of_bin[ibin] ; scell <= last_cell_of_bin[ibin] ; scell++ ) { for( int iPart=particles->first_index[ipack*packsize_+scell] ; ( int )iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -1067,7 +1067,7 @@ void SpeciesV::dynamicsTasks( double time_dual, unsigned int ispec, } else if( mass_==0 ) { for( int scell = first_cell_of_bin[ibin] ; scell <= last_cell_of_bin[ibin] ; scell++ ) { for( int iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= length[i]; @@ -1366,27 +1366,27 @@ void SpeciesV::sortParticles( Params ¶ms ) //Loop over just arrived particles to compute their cell keys and contribution to count for( unsigned int idim=0; idim < nDim_field ; idim++ ) { for( unsigned int ineighbor=0 ; ineighbor < 2 ; ineighbor++ ) { - buf_cell_keys[idim][ineighbor].resize( MPI_buffer_.part_index_recv_sz[idim][ineighbor] ); + buf_cell_keys[idim][ineighbor].resize( MPI_buffer_.partRecv[idim][ineighbor]->size() ); // #pragma omp simd - // for( unsigned int ip=0; ip < MPI_buffer_.part_index_recv_sz[idim][ineighbor]; ip++ ) { + // for( unsigned int ip=0; ip < MPI_buffer_.partRecv[idim][ineighbor]->size(); ip++ ) { // for( unsigned int ipos=0; ipos < nDim_field ; ipos++ ) { - // double X = ((this)->*(distance[ipos]))(&MPI_buffer_.partRecv[idim][ineighbor], ipos, ip); + // double X = ((this)->*(distance[ipos]))(MPI_buffer_.partRecv[idim][ineighbor], ipos, ip); // int IX = round( X * dx_inv_[ipos] ); // 
buf_cell_keys[idim][ineighbor][ip] = buf_cell_keys[idim][ineighbor][ip] * length_[ipos] + IX; // } // } // // not vectorizable because random access to count - // for( unsigned int ip=0; ip < MPI_buffer_.part_index_recv_sz[idim][ineighbor]; ip++ ) { + // for( unsigned int ip=0; ip < MPI_buffer_.partRecv[idim][ineighbor]->size(); ip++ ) { // count[buf_cell_keys[idim][ineighbor][ip]] ++; // } computeParticleCellKeys( params, - &MPI_buffer_.partRecv[idim][ineighbor], + MPI_buffer_.partRecv[idim][ineighbor], &buf_cell_keys[idim][ineighbor][0], &count[0], 0, - MPI_buffer_.part_index_recv_sz[idim][ineighbor] ); + MPI_buffer_.partRecv[idim][ineighbor]->size() ); } } @@ -1403,8 +1403,8 @@ void SpeciesV::sortParticles( Params ¶ms ) //Now proceed to the cycle sort - if( MPI_buffer_.partRecv[0][0].size() == 0 ) { - MPI_buffer_.partRecv[0][0].initialize( 0, *particles ); //Is this correct ? + if( MPI_buffer_.partRecv[0][0]->size() == 0 ) { + MPI_buffer_.partRecv[0][0]->initialize( 0, *particles ); //Is this correct ? } // Resize the particle vector @@ -1418,7 +1418,7 @@ void SpeciesV::sortParticles( Params ¶ms ) //Copy all particles from MPI buffers back to the writable particles via cycle sort pass. for( unsigned int idim=0; idim < nDim_field ; idim++ ) { for( unsigned int ineighbor=0 ; ineighbor < 2 ; ineighbor++ ) { - for( unsigned int ip=0; ip < MPI_buffer_.part_index_recv_sz[idim][ineighbor]; ip++ ) { + for( unsigned int ip=0; ip < MPI_buffer_.partRecv[idim][ineighbor]->size(); ip++ ) { cycle.resize( 1 ); cell_target = buf_cell_keys[idim][ineighbor][ip]; ip_dest = particles->first_index[cell_target]; @@ -1429,7 +1429,7 @@ void SpeciesV::sortParticles( Params ¶ms ) cycle[0] = ip_dest; cell_target = particles->cell_keys[ip_dest]; //As long as the particle is not erased, we can build up the cycle. - while( cell_target != -1 ) { + while( cell_target >= 0 ) { ip_dest = particles->first_index[cell_target]; while( particles->cell_keys[ip_dest] == cell_target ) { ip_dest++; @@ -1441,7 +1441,7 @@ void SpeciesV::sortParticles( Params ¶ms ) //Last target_cell is -1, the particle must be erased: particles->translateParticles( cycle ); //Eventually copy particle from the MPI buffer into the particle vector. - MPI_buffer_.partRecv[idim][ineighbor].overwriteParticle( ip, *particles, cycle[0] ); + MPI_buffer_.partRecv[idim][ineighbor]->overwriteParticle( ip, *particles, cycle[0] ); } } } @@ -1450,14 +1450,14 @@ void SpeciesV::sortParticles( Params ¶ms ) for( unsigned int ip=( unsigned int )particles->last_index.back(); ip < npart; ip++ ) { cell_target = particles->cell_keys[ip]; - if( cell_target == -1 ) { + if( cell_target < 0 ) { continue; } cycle.resize( 0 ); cycle.push_back( ip ); //As long as the particle is not erased, we can build up the cycle. 
- while( cell_target != -1 ) { + while( cell_target >= 0 ) { ip_dest = particles->first_index[cell_target]; @@ -1533,7 +1533,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys particles cell_keys[iPart] = std::round( position_x[iPart] * dx_inv_[0]) - min_loc_l ; cell_keys[iPart] *= length_[1]; @@ -1553,7 +1553,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles cell_keys[iPart] = std::round(position_x[iPart] * dx_inv_[0] )- min_loc_x ; cell_keys[iPart] *= length_[1]; @@ -1573,7 +1573,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles cell_keys[iPart] = std::round(position_x[iPart] * dx_inv_[0] )- min_loc_x ; cell_keys[iPart] *= length_[1]; @@ -1589,7 +1589,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles cell_keys[iPart] = round(position_x[iPart] * dx_inv_[0] )- min_loc_x ; } @@ -1598,7 +1598,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, } for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { count[cell_keys[iPart]] ++; } } @@ -2526,7 +2526,7 @@ void SpeciesV::ponderomotiveUpdatePositionAndCurrentsTasks( double time_dual, un smpi->traceEventIfDiagTracing(diag_PartEventTracing, Tools::getOMPThreadNum(),0,11); for( int iPart=particles->first_index[scell] ; iPartlast_index[scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. 
for( int i = 0 ; i<( int )nDim_field; i++ ) { particles->cell_keys[iPart] *= length_[i]; diff --git a/src/Species/SpeciesVAdaptive.cpp b/src/Species/SpeciesVAdaptive.cpp index b24d86711..98813c71e 100755 --- a/src/Species/SpeciesVAdaptive.cpp +++ b/src/Species/SpeciesVAdaptive.cpp @@ -275,7 +275,7 @@ void SpeciesVAdaptive::scalarDynamics( double time_dual, unsigned int ispec, // if( mass_>0 ) { // // for( iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -289,7 +289,7 @@ void SpeciesVAdaptive::scalarDynamics( double time_dual, unsigned int ispec, // } else if( mass_==0 ) { // // for( iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -754,7 +754,7 @@ void SpeciesVAdaptive::scalarDynamicsTasks( double time_dual, unsigned int ispec if( mass_>0 ) { for( int iPart=particles->first_index[ipack*packsize_+scell] ; ( int )iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -768,7 +768,7 @@ void SpeciesVAdaptive::scalarDynamicsTasks( double time_dual, unsigned int ispec } else if( mass_==0 ) { for( int iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= length[i]; @@ -1662,7 +1662,7 @@ void SpeciesVAdaptive::scalarPonderomotiveUpdatePositionAndCurrentsTasks( double smpi->traceEventIfDiagTracing(diag_PartEventTracing, Tools::getOMPThreadNum(),0,11); for( int iPart=particles->first_index[first_cell_of_bin[ibin]] ; iPartlast_index[last_cell_of_bin[ibin]]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. 
for( int i = 0 ; i<( int )nDim_field; i++ ) { particles->cell_keys[iPart] *= length_[i]; diff --git a/src/Tools/Timers.cpp b/src/Tools/Timers.cpp index 0cd6dac0c..d3edda0e4 100755 --- a/src/Tools/Timers.cpp +++ b/src/Tools/Timers.cpp @@ -18,7 +18,7 @@ Timers::Timers( SmileiMPI *smpi ) : collisions( "Collisions" ), // Call to Collisions methods movWindow( "Mov window" ), // Moving Window loadBal( "Load balancing" ), // Load balancing - syncPart( "Sync Particles" ), // Call exchangeParticles (MPI & Patch sync) + syncPart( "Sync Particles" ), // Call initExchParticles (MPI & Patch sync) syncField( "Sync Fields" ), // Call sumRhoJ(s), exchangeB (MPI & Patch sync) syncDens( "Sync Densities" ), // If necessary the following timers can be reintroduced particleMerging( "Part Merging" ), // Particle merging From 50891ac16ef744f99d6d240606cfec7be65793d1 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Tue, 2 Apr 2024 17:00:55 +0200 Subject: [PATCH 02/54] repurpose extractParticles --- src/Particles/Particles.cpp | 17 ++++++++----- src/Particles/Particles.h | 2 +- src/Patch/Patch.cpp | 51 ++++++++----------------------------- src/Species/Species.cpp | 10 -------- src/Species/Species.h | 6 ----- 5 files changed, 22 insertions(+), 64 deletions(-) diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp index aa9b8a02c..d628e24d2 100755 --- a/src/Particles/Particles.cpp +++ b/src/Particles/Particles.cpp @@ -1303,14 +1303,17 @@ void Particles::copyFromDeviceToHost() ERROR( "Device only feature, should not have come here!" ); } -void Particles::extractParticles( Particles* particles_to_move ) +// Loop all particles and copy the outgoing ones to buffers +void Particles::extractParticles( const bool copy[], Particles* buffer[] ) { - particles_to_move->clear(); - // for ( int ipart=0 ; ipart> copy( 3 ); - copy[0] = { neighbor_[0][0] != MPI_PROC_NULL, neighbor_[0][1] != MPI_PROC_NULL }; - copy[1] = { neighbor_[1][0] != MPI_PROC_NULL, neighbor_[1][1] != MPI_PROC_NULL }; - if( params.nDim_field > 2 ) { - copy[2] = { neighbor_[2][0] != MPI_PROC_NULL, neighbor_[2][1] != MPI_PROC_NULL }; + bool copy[params.nDim_field*2]; + Particles* sendBuffer[params.nDim_field*2]; + for( size_t iDim = 0; iDim < params.nDim_field; iDim++ ) { + copy[2*iDim+0] = neighbor_[iDim][0] != MPI_PROC_NULL; + copy[2*iDim+1] = neighbor_[iDim][1] != MPI_PROC_NULL; + sendBuffer[2*iDim+0] = buffer.partSend[iDim][0]; + sendBuffer[2*iDim+1] = buffer.partSend[iDim][1]; } if( params.geometry == "AMcylindrical" ) { - copy[0][0] = copy[0][0] && ( Pcoordinates[0]!=0 || vecSpecies[ispec]->boundary_conditions_[0][0]=="periodic" ); - copy[0][1] = copy[0][1] && ( Pcoordinates[0]!=params.number_of_patches[0]-1 || vecSpecies[ispec]->boundary_conditions_[0][1]=="periodic" ); + copy[0] = copy[0] && ( Pcoordinates[0]!=0 || vecSpecies[ispec]->boundary_conditions_[0][0]=="periodic" ); + copy[1] = copy[1] && ( Pcoordinates[0]!=params.number_of_patches[0]-1 || vecSpecies[ispec]->boundary_conditions_[0][1]=="periodic" ); } - // Loop all particles and count the outgoing ones - for( size_t ipart = 0; ipart < part.size(); ipart++ ) { - if( part.cell_keys[ipart] < -1 ) { - if( part.cell_keys[ipart] == -2 ) { - if( copy[0][0] ) { - part.copyParticle( ipart, *buffer.partSend[0][0] ); - } - } else if( part.cell_keys[ipart] == -3 ) { - if( copy[0][1] ) { - part.copyParticle( ipart, *buffer.partSend[0][1] ); - } - } else if( part.cell_keys[ipart] == -4 ) { - if( copy[1][0] ) { - part.copyParticle( ipart, *buffer.partSend[1][0] ); - } - } else if( 
part.cell_keys[ipart] == -5 ) { - if( copy[1][1] ) { - part.copyParticle( ipart, *buffer.partSend[1][1] ); - } - } else if( part.cell_keys[ipart] == -6 ) { - if( copy[2][0] ) { - part.copyParticle( ipart, *buffer.partSend[2][0] ); - } - } else if( part.cell_keys[ipart] == -7 ) { - if( copy[2][1] ) { - part.copyParticle( ipart, *buffer.partSend[2][1] ); - } - } - } - } - + part.extractParticles( copy, sendBuffer ); + } // copyExchParticlesToBuffers(... iDim) @@ -706,7 +679,6 @@ void Patch::exchParticles( SmileiMPI *smpi, int ispec, Params &, int iDim, Vecto for( int iNeighbor=0; iNeighborsize()<<" n_recv "<size()); // Send Particles &partSend = *buffer.partSend[iDim][iNeighbor]; if( partSend.size() != 0 && is_a_MPI_neighbor( iDim, iNeighbor ) ) { @@ -720,7 +692,6 @@ void Patch::exchParticles( SmileiMPI *smpi, int ispec, Params &, int iDim, Vecto int iOppositeNeighbor = ( iNeighbor+1 )%2; Particles &partRecv = *buffer.partRecv[iDim][iOppositeNeighbor]; if( partRecv.size() != 0 && is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) { - // MESSAGE(" patch "<typePartRecv[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &partRecv ); int local_hindex = neighbor_[iDim][iOppositeNeighbor] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][iOppositeNeighbor] ]; int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 ); diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp index 31ab4c1a5..0fb38f673 100755 --- a/src/Species/Species.cpp +++ b/src/Species/Species.cpp @@ -1744,16 +1744,6 @@ void Species::computeCharge( ElectroMagn *EMfields, bool old /*=false*/ ) }//END computeCharge -void Species::extractParticles() -{ - particles->extractParticles( particles_to_move ); -} - -// void Species::injectParticles( Params ¶ms ) -// { -// } - - // --------------------------------------------------------------------------------------------------------------------- //! Sort particles // --------------------------------------------------------------------------------------------------------------------- diff --git a/src/Species/Species.h b/src/Species/Species.h index 56c693d65..b91c9521b 100755 --- a/src/Species/Species.h +++ b/src/Species/Species.h @@ -482,12 +482,6 @@ class Species //! Method calculating the Particle charge on the grid (projection) virtual void computeCharge( ElectroMagn *EMfields, bool old=false ); - //! Method used to select particles which will change of patches - virtual void extractParticles(); - - //! Method used to integrate particles which come from another patches - // virtual void injectParticles( Params ¶ms ); - //! 
Method used to inject and sort particles virtual void sortParticles( Params ¶m ); From da1b17248adcb1ccb7171f8f31ad7b4faf853f53 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Tue, 2 Apr 2024 18:30:02 +0200 Subject: [PATCH 03/54] CI on particle_exchange --- .gitlab-ci.yml | 8 ++++++++ src/Particles/Particles.cpp | 2 +- src/Particles/Particles.h | 7 +++---- src/Patch/Patch.cpp | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6faa6ff17..f50bfd819 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,7 @@ install: stage: install only: - develop + - particle_exchange script: # Force workdir cleaning in case of retried @@ -33,6 +34,7 @@ compile_default: stage: compile_default only: - develop + - particle_exchange script: # Move in test dir @@ -44,6 +46,7 @@ runQuick: stage: run_quick only: - develop + - particle_exchange script: # Move in test dir @@ -55,6 +58,7 @@ run1D: stage: run_default only: - develop + - particle_exchange script: # Move in test dir @@ -67,6 +71,7 @@ run2D: stage: run_default only: - develop + - particle_exchange script: # Move in test dir @@ -81,6 +86,7 @@ run3D: stage: run_default only: - develop + - particle_exchange script: # Move in test dir @@ -96,6 +102,7 @@ runAM: stage: run_default only: - develop + - particle_exchange script: # Move in test dir @@ -108,6 +115,7 @@ runCollisions: stage: run_default only: - develop + - particle_exchange script: # Move in test dir diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp index d628e24d2..62f8b67af 100755 --- a/src/Particles/Particles.cpp +++ b/src/Particles/Particles.cpp @@ -1304,7 +1304,7 @@ void Particles::copyFromDeviceToHost() } // Loop all particles and copy the outgoing ones to buffers -void Particles::extractParticles( const bool copy[], Particles* buffer[] ) +void Particles::extractParticles( const size_t /* ndim */, const bool copy[], Particles* buffer[] ) { for( size_t ipart = 0; ipart < size(); ipart++ ) { if( cell_keys[ipart] < -1 ) { diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h index aa7fbbe9f..a155baf7a 100755 --- a/src/Particles/Particles.h +++ b/src/Particles/Particles.h @@ -473,10 +473,9 @@ class Particles // Accelerator specific virtual functions // ----------------------------------------------------------------------------- - //! Extract particles from the Particles object and put - //! them in the Particles object `particles_to_move` + //! Extract particles escaping the box to buffers // ----------------------------------------------------------------------------- - virtual void extractParticles( const bool copy[], Particles* buffer[] ); + virtual void extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] ); // ----------------------------------------------------------------------------- //! Erase particles leaving the patch object on device @@ -484,7 +483,7 @@ class Particles virtual int eraseLeavingParticles(); // ----------------------------------------------------------------------------- - //! Inject particles from particles_to_move object and put + //! Inject particles from particles_to_inject object and put //! them in the Particles object //! 
\param[in,out] particles_to_inject Particles object containing particles to inject virtual int injectParticles( Particles *particles_to_inject ); diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp index c7e3ebd78..f0bb6a1fb 100755 --- a/src/Patch/Patch.cpp +++ b/src/Patch/Patch.cpp @@ -552,7 +552,7 @@ void Patch::copyExchParticlesToBuffers( int ispec, Params ¶ms ) copy[1] = copy[1] && ( Pcoordinates[0]!=params.number_of_patches[0]-1 || vecSpecies[ispec]->boundary_conditions_[0][1]=="periodic" ); } - part.extractParticles( copy, sendBuffer ); + part.extractParticles( params.nDim_field, copy, sendBuffer ); } // copyExchParticlesToBuffers(... iDim) From 593e96c545fa40d8811673639c087f3af3585e2a Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Wed, 3 Apr 2024 00:36:25 +0200 Subject: [PATCH 04/54] Fix in the new copyParticles --- src/Particles/Particles.cpp | 7 ++++--- src/Patch/Patch.cpp | 11 ++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp index 62f8b67af..b675ac12f 100755 --- a/src/Particles/Particles.cpp +++ b/src/Particles/Particles.cpp @@ -422,12 +422,13 @@ void Particles::copyParticles( vector indices, Particles &dest_parts, in { const size_t transfer_size = indices.size(); const size_t dest_new_size = dest_parts.size() + transfer_size; + const size_t displaced_size = dest_parts.size() - dest_id; for( unsigned int iprop=0 ; ipropresize( dest_new_size ); auto loc = dest_parts.double_prop_[iprop]->begin() + dest_id; - move_backward( loc, loc + transfer_size, dest_parts.double_prop_[iprop]->end() ); + move_backward( loc, loc + displaced_size, dest_parts.double_prop_[iprop]->end() ); // Copy data for( size_t i = 0; i < transfer_size; i++ ) { ( *dest_parts.double_prop_[iprop] )[dest_id+i] = ( *double_prop_[iprop] )[indices[i]]; @@ -438,7 +439,7 @@ void Particles::copyParticles( vector indices, Particles &dest_parts, in // Make space in dest array dest_parts.short_prop_[iprop]->resize( dest_new_size ); auto loc = dest_parts.short_prop_[iprop]->begin() + dest_id; - move_backward( loc, loc + transfer_size, dest_parts.short_prop_[iprop]->end() ); + move_backward( loc, loc + displaced_size, dest_parts.short_prop_[iprop]->end() ); // Copy data for( size_t i = 0; i < transfer_size; i++ ) { ( *dest_parts.short_prop_[iprop] )[dest_id+i] = ( *short_prop_[iprop] )[indices[i]]; @@ -449,7 +450,7 @@ void Particles::copyParticles( vector indices, Particles &dest_parts, in // Make space in dest array dest_parts.uint64_prop_[iprop]->resize( dest_new_size ); auto loc = dest_parts.uint64_prop_[iprop]->begin() + dest_id; - move_backward( loc, loc + transfer_size, dest_parts.uint64_prop_[iprop]->end() ); + move_backward( loc, loc + displaced_size, dest_parts.uint64_prop_[iprop]->end() ); // Copy data for( size_t i = 0; i < transfer_size; i++ ) { ( *dest_parts.uint64_prop_[iprop] )[dest_id+i] = ( *uint64_prop_[iprop] )[indices[i]]; diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp index f0bb6a1fb..546e0ca08 100755 --- a/src/Patch/Patch.cpp +++ b/src/Patch/Patch.cpp @@ -634,8 +634,8 @@ void Patch::prepareParticles( SmileiMPI *smpi, int ispec, Params ¶ms, int iD Particles &partSend = *buffer.partSend[iDim][iNeighbor]; // Enabled periodicity - if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL && partSend.size() != 0 ) { - if( smpi->periods_[iDim]==1 ) { + if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL ) { + if( partSend.size() > 0 && smpi->periods_[iDim]==1 ) { if( iNeighbor == 0 && Pcoordinates[iDim] == 0 ) { for( size_t 
iPart=0; iPart < partSend.size(); iPart++ ) { if( partSend.position( iDim, iPart ) < 0. ) { @@ -651,17 +651,14 @@ void Patch::prepareParticles( SmileiMPI *smpi, int ispec, Params ¶ms, int iD } } } - } - - if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL ) { + // Initialize receive buffer with the appropriate size if( is_a_MPI_neighbor( iDim, iNeighbor ) ) { if( buffer.partRecvSize[iDim][iNeighbor]!=0 ) { buffer.partRecv[iDim][iNeighbor]->initialize( buffer.partRecvSize[iDim][iNeighbor], *vecSpecies[ispec]->particles ); } - } // Swap particles to other patch directly if it belongs to the same MPI - else { + } else { int iOppositeNeighbor = ( iNeighbor+1 )%2; SpeciesMPIbuffers &neighbor_buffer = ( *vecPatch )( neighbor_[iDim][iNeighbor]- vecPatch->refHindex_ )->vecSpecies[ispec]->MPI_buffer_; swap( buffer.partSend[iDim][iNeighbor], neighbor_buffer.partRecv[iDim][iOppositeNeighbor] ); From f5659256bf7b8e0cf6371611c1caa7c6958b9a2b Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Wed, 3 Apr 2024 11:11:15 +0200 Subject: [PATCH 05/54] new reference --- .../references/tst2d_04_laser_wake.py.txt | Bin 51895 -> 51895 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/validation/references/tst2d_04_laser_wake.py.txt b/validation/references/tst2d_04_laser_wake.py.txt index 48d9eaecad05f679f5c1af836be621ca3b01a1e9..094e7c366dca0e242cf0f955e47a7c88c3b3a09d 100755 GIT binary patch delta 23828 zcmWJscRZC16i%hAlAXO}ME2&`GbLL_#{TR3DrA(lD4&W# zWi>QZ-~H?U?)}}rp7)&doacGYd%bAW^`cD$t;CI9wYPf?O2PU*qq-wxA?Vgn=2hmW ze`n~?-gZb8OeP|3`fM_S#y-_f&D8zK$GPvBTp0;#zfZhhWRJwJ9>QVMDzW(X+SU}d z(PVV8wzs=moQ!XTsxA9JCSvD8^>0dAJU*U?lJ?__MLDfr`G~}56uEWpK2Amu=+3q9 z^7N@-Y-21K`pfH&qj6lPY->Dlxv0Q*_g{PLA3s+9NmvK{+LJfDAk<;|i{0`gLiRA_ z@Xfa&>LB>`?~q?A_XFLyo70y!1q1s`io>NN5#XY}zudz$8XP>5UF9Ffz^gZ@4=lXn zVdzA2>Gp|4n0Yv`Vh}?lgZpP`(J!x(;d(n;MY~QcY{71eX|YHsDQGP%sUShV^Iz9b zTJ~^ekd_l|Z-f#;m#pr~t74M@<7ov(DXdZ1KRkY15IrNc&%ga<4ezCs=6UqZAc@~| z55pr<_@63mD3p>jop%vv0Zd^~9?!jp=& z3!U;U^O=|uC_7FXq2YJk|E5g~oQWLiuXT%JF@#IP0)OPspePfw8{PTi-fy(%7jGu;htHBrE{Y>F&!o(eTS%#}Lo z>EJVBUFXA*36)>!7Vh@Yz|$(AdU!1p7ANC2hs&nI$v&r3o-FZ@@MZa%0mos8Q2*sn zQ0@tAO3SrOI_g;Mv6&Dv6*a|lf5C5WUYp|K$S7Yce>0rw9ppW*)dnQi3RvIOo58x$ z82jB)Gl;V|HtDl!3g<%q{`bt@0A9>xoOQqHf^nB6k8b@Dfqj!sKdryVVei>`DZZ9e zTw(U9+7LuT=Piu4IG)fjfiXd1gf|l}^n05o8Kq-sJ8_NeO#>A>C(9>EtrXmoky*!m zEgpRi%jIR=^gQ88hDRiKePHM6_Q0C4}WEf2Vu9{As1vL zL61wRw&Je~*ee)Hj_Md-j?$;6maC>{qM0G;Oqk(-ee$der#TMPRSTBAv4&y`ug|~L z%%SVdE}=pNbI^C$Fzs{83<%-te?LBbYYffD>&{ir+T+spTh0N);i&idTuI7o9P0Ek z{a_5HV*S-WtNUNl@b|NolP@P}Xv$+#Co7bR8@(FJH&~|Q^Ztys=58uRDdzoa9-&}J zSkw=dfp{DqPYa#6Oh%!5w+}cd(I1hc_|?!U66RA{-fW$ABk*NyX4O)c5k^JbJZd9o z4CXZt!>pFvK#VQK%4nPf(dVCi;}DAg?*F`uR%^&Gd0a+(%Sb%OpVSeSXA&iku(+ z@!v!^Fzopv{$1e!c)ILZJ=0>07l-ELemyZm^NFXIgFczz22x|XT)#QKiQ?}uylf4+ zL}1RLYvxe>uwb`tjydqa!?VsM=3tg^A#Q2d6!;u=GWQi*;E%v}4ApL7s4BZvR(v!L z;X{5}z7m1(>2={Boht`@t8|l${^rIC~dVh(@BS3mBze5u}p~Zk3RZ(oCZD#LVbq~vf#uD=iGn|0aI684kS#*!GA># zF{MYtz*mPP8KGneS4!OyPkb^($-Gvl-)-h-_Q}%sMzT4IKOcUdYHNWH)qSIP>RQ8r zkW&PY>%2LXRZ+B^_$^?|1$|c=3kz_RHuOv~HHW{iI*mW>(!qsGrgfjrgrEe^a?)%= z9Exp;zdZhtK(!2=hUu&J9aTEG#g8L;v|F$rdPL!MB@P z!x~EcyUXwXGKWU@EJ>D27VzmiPp9Qm3#hyuy}nJ-0{BX(UvJ&v$1_TvQmW}e$f`E5 z<8M|R62mvh>=kxt$RE}Z(QH~6@9FU%f$HB;e;+1kT_Gla0n$jmF&mTBH`;3H6k)?Rq#2uaZKZ`DHF~(P< zSU1iQWAL4|;CH6DL-3LY_pLu90#=IFE|NJTKw7RWdht3LvJVKD^i9XZ?*WAYVu1o5 z)|070qg3!~uV1-ml@4)9#b&Q|XTo*Wkpl7@4FVEBe)Q?f0?~b3Q|AKGpk#St!ljFG z5Pk6Drks00P$ehJQTagt>ZFWa?YJzkHS~ud=d1-%`FVD2`H!%`>3y3|eLru3EYb{} z*~?bod9;0MF2w?FJ`vph`?dv4D8xK5@v#JEIU3odzyfsNa1;kRNkLF&(!zVu03_Wy 
zlv-s=hw@#}t-G~pC~wDWHL{Y0Q#q9e^)oco&JdLu)5t`g23GB0I+95{p0+0rP_b$G z=;StHmV&N9OqO5A;<2gHx8pz=84K?{i@x$J9HHu=rd2-)#R?R2HI&^^J%Xlxw8aQV zY~ILlY%qbwJb_J~749$_ENe2!699QW%wN9miU3zm;Z~_`GW?t5@htfg5AW<~8y_xF zK=|~))klx1;7}z(HFZpf%^&@W2qVo*P!g&?IR23a7ul6L%U7~sXVQ4Z>OdN}`x&jh z3X6lr@{I=%ga$xdAm@cJ6-m?^c2pDzx4`+$hm{tCEb*K=Z{b#1OBDFcpzig~0<~g0 zVItBBHav8=*FR_h4|^UR&wFJ7lpYhls_T{@6aI3%^``|q-Z4m&%>L5^bseF~*l0gg zV}H9raeEvprFQAV3`I)4Oj3&oifacK&dK$Yt3_T0bTFDZp_QbT329eDA6wFqlzerpp`0}vycQG&H%X<# z1*ewUHHA1>rU+c-Z1#hu3a73vS54ecyvUgITeiUK>oq5`otcv!>B!tm z;;8sc#jY0vDu>r7=py~ay81^v#`-th4SGq&qOt=s$I>J4Rjf_c;_d(}Kd8x5GwY6L zL!|c4$e3W0WMPZ4ixJfONsDq9xItBEasG!Y63D#uYnIs<0eO2&?K(=xu-E_Omb0Vr zP%d2B8-JHV0Qcqs@r*7i+aZfk%v1IGw@rzOdO6!~VL6 zW%gMhvlhcw*F%~nMAom5`(4v< z`(phJ<69~&AG;IFGeg1t!!DuH@8WTxyWF9*pN!qLohBnz5y)upE~bh2OhTffSnUqI zzt0AkpAwih!VA?I1J9+6Ved*()XrHqh~WHr+hLLf)3lJjYU>DC9<_=O=p=)H7W;+U z3-RzKd5D?(f&xMti)h++sSqXWeR{zy9bPpjT&vrY35^5HsuNQ*5S6?3Z<}E@Y{;#W z>F7;|Au-Yo!jT*coHrw@mk=YDHOzt9A`UobibezC+g$ktVnw!*y1FKt`CTjJTD zrni=Y=;Xs@(Wk%367HW*xvHpb0R>UZzITc&;htLHfdvtI-xq$j1%$GLh5ccOdV9 zbS%CUop`vB~2PO8^Levg@!;iGZCO2|h(M41Q#Z zr!9CQcj{Q+MS{gH&)U@j*erG^IXYUvo4->#(!wp_xmAN^yoD7!%{@`!)Mp7f%egP6 zge{;y;ZKfKtPSq0TOW-3L`Ff~Kc5<8vGAPAaU+GnW`26R>OfOnA^y3 zpGjrL^D$j!|2jXOGDm@-nV7I;dQtv0e5`W9Ass@GxcGgL&4j4FzJwo-X#_+{PwrFR zn+MQ&I%o9j_(QCW-I*p zF0jmg$pTNMb&5$iSzvuh<9OLsGq}EHC$XKba14T;ZX`4FU16^Z#C}Afvw%=#%P%OK$-EO zXeORKd&8lXUX()nwTb@Ibfq&VCH*|}4F#9~tFb&Z8;?`=JMZ*eAmboQg6s*I2rQYN z;J06=FO^nySsDiJIA5OXA(do|Rc0xks#}cVM%k$YY&q_r^{+Hj=MxDc&3s-I{0WDS zN;6;cMlxN72ey6bi-()qi!|Mr6tKQ+@*#MD3dc%_ReN94bm*^Sz5al%bUH3Du@@l? z`14;s@iL?H_he@lKP3a~h5~PqxT3*$-uB`lRYhP~^&2XFXpV`^E&{_!R>-Dj+f(9W zg%Qv9W$(Odfw`aMYwmq9$0g?<3Ge-I=}v=JY>klSNthqnSWfFYl+l@?rG#iB~WU%D{(>UX>Z@*{G>kv8Dbt4WCUt z_k2ZnGE{PfmUT?iF)sSrb%Ehd%wb5Wi1^gL_a$NYbfotxE!`Y{x4O4DeY3=N zJua#AVN0x!o8El8(gL?fXZ{%nb5vs$+tg@b3bQ?jILw&LV091GzxJ2~v^O15J58fQ znbYz;*U1vrUvGZDcgze5t_K|YqAv$OALc5UZwg15+vl{8(2*2Rmbot~n~j%y{$@>H zCuo@W^~j}Bu1su5%s-)Ql8z_d%g&9Rr{YUVhjxkA6x_RakxTn!JdVFUVfCbfjLYx8 zi_M!x;J_f;nYLLHCS3c;|pmcoH`ML27`FzbXbAc%Ph>9 zP_cTxqw^+Re#^5hI&f!$Uq-Ldb2^lA0eoWTox`E}*3(p8E-4hK94h1sHpg^lhf@*B zmRRZCrnSS)5-J3Q4&S^i3koL3z{u z*S8`qfoGygMIzb)JcL9xuhpBuRtNsIS4UJ~BG85R{Iy`b`!Q0>=0FBs_R~F+#+8j< zereuOsimRCF`c9F+cME4n60~xj%4oTrVVz*R6H;Gqa$sSg8ypIdeUCUW1G}Jg}Od6 zom0AqyZa7CAYXODxxsHF{LL=)PW*y9b_jBqYCJT?&afk#(mw&x77|<@_`88&i?7OP zHVOE-%52&P!eOHCVR=dc87l4vix_vu!^bEG_g%%cq0AXjet& zY6iqlUC&ysqrpJslW4v#S)i0k7>IPnWB{cqYs`l`1nPC#`mD!Q@aWnlR`*piYz%mQ znkHe1{|Q?Tez;Al~r%z(#UhvO%^CbTWuHyqp(fMJ9$cdR{s1~!%q>+D;j zn?lOo_RV=Tq?W3jEZC5VP1Mr&uNBg9^6b^h!bB+vLABns9@eU8%C| zuSh_A2t1QKC=dagmhKuqDJR3(u!{j(@5TdZSEQ==B?=UHh&%{Nph9xQ+(`|3QQlJz zR99Tg01vf|Z#SK%fpv+=uZI7!pl4P*kJlgrbos8PnT!R%{d9(uoo_XfX*_?5pU(_A z1;Y)RUs&KEn~}zc59ZiyGE3c7WKQ6f%md+H-k4%w#_{`u-08TdD6qvUiHajVCY~P02?`#vzs+&&NIah6 ze(ZMR4Bhi}6yVZq3&$wGr#`dCNqD_QVxXhf4VjO52Hm-3h;0g1SHvzFgYtxKCI5eJ zuq7Zn<@yo{3{Fjme36O(snk{v)$?SC72}MS!g$CwbR7~Zqd=0~=3lPiRLC5zh%}%V z<=axB9~X()447At9VSv~@ZUN8;0uFUkoQ=1-t%KRT&+6p!gtCK3K;cSbAxrT(J9lK zP&38I8bijLjuv=~o29(a-yHA%cC*R-X@*DocSZSqG{L}mBaTQZBVaphb(~yA*Abfc zj&WTy1zr2z2eH)d?Ft*C#)H4&yQgO(DKL6PjJep53XQTJ zLqq)OFy8P`q6@$oN#3OtA^nBnLfxTA%?bX@*RUx~^D$$2G6> zzchTauR%aMJPZ5uFK`?C(r{DYS_AAgIbVlZFE*CXB(OyBoY zVBtbMK9wDmFwZ8V^3f!j7yaQ_bFWGD@hcKKU;glc$HN_|LuRkzNya$CKQp{~f!;k| zhr%E9yTQAKEt|<1B#`~H;eK>$I9!q%Dot}BgHS5#?G?Xx*zMl^HH3~Nv8UjqMSvI; zA_tqVo3p0FFQ&&H&#z~|_e80{1KKp$Oq>ekOUnYytjxhJpVNS&;7FaipBt3U3n*Mv z*TVlSyp23ho1nJE>W)}JGjz{>BV(apj>WeUoUJ)cG08d1MaJG3%M$LNGnmka%hJpt zaFmumj5X 
z?NzhmPeU$V)^Z-xEPQStu(QyBPT#^CcK#~Nz}ozlaqgdK7~gL5uThqsj-5Dq;VMvY zb%sYrBq|DTymX;E zWZO~s6+?I_-ub2_-593CetsJ~W(@mzC{$%OQ;76*_?*pd1m5ynOKeNkAyfS%sldPv z|8jd;e|4u~XD<*KBuA8;Cr9NjZ~CSpct*# zQqb}8+{N&Sczm;KW7q`~GL9wBtR)PEW9$b7uDVMk4EX5b*%0Z5)m#lvHXJub)+XiA zM>HeYX2Z(MWaAE_{zDSiyh(7+Lm#EHF!V(3m~u9^ORRl5AnTVuOxLxE2x zerj)1rozu%I+{D^M&wZWnO#w5=#z5W?ur|KGvSEEY8OQ$3r>p$x~x2*!rYq^Q8Olc zVbozLx$WH^>?l%wd`;UJhnFRDjP{w}?4Hm^JKaoCVd=F(t%xzEIRE&T;S5L;BIcM2 zy0k#1yJ2=g!vKhD*N7o8x{a2mKlE~!F*sLFt^CY3hDcUzyZR3X(8l)qbcnVpNc4QJ z9MHAI?OJTZ2IeWqrG1{(`Im;eGqICeGnx3{?K0C|at1m!W;w0W`@hr8Y)kA^Dn4Sb zcrEEjL5({B*L>(jX-HgA62It3=Wm&OOa1w9?9g-UdU%zD7g*waCyU+C&V63+AfE{) zsFxU7Y%zq+tb$u3MQ-rXtC>Tej|7}E7M?&X9&@d?j>en)AK^qEW`G*_<{a=oCzyB42U_s3FTDDj;l zcdj{XNK!txRJ|LSqgKp>PXX@y$@LaFOt5s7QAO#iG3vgZifZyS!o%B+TtB9$k0amp z{nrcC;iy1S_A?zl&<%Z8&dO~F>)yWea!-ul&;GipZRY_l+f+74*69NqURDv9-c10< zq|AEg6JvyD^?krZLGnx0;S;xL_{OkO`N7ppd|$gqYWQFVGW*G@bhM_Sjo3o6`SDcr z=*n-r}b;MlCb=}XpMY@D~jYDJj|zK zgl{1};Je-v3uK@|#dL!iaD&tQ+xqP!}QJnGr>2DKso>j#;UMLkHldX4e3ml2q7?x@&#NFSP$a{GlMJlp*gm``D-F$vzD2?} zGZjx-ifmKgN*|Rc%C4=_GhbaZVdeyJGR~g(kon?JIJQe2EDc}v$DaoxHc>aZp^)DX z9&vReBu!sRsFXRHe0OJ~U@i&9i(`UV<-?)pT*$vWs$>{q;?D6liHGFR z|4J2xl0a&+>(RD~RIvGLPTa1!k_NAwYm*|lGr+py>v-I?ObE}`rMB*+!AMugNG@w4 zaP}pP_7xjJ(a-PC*{kHx>#fs7RIN4&Tj;M*YUw^7Z(i0|vjL88iC~^<)kh__pd&Y% zv~aAwW$`st8T73sXeazNVU26sNnMIA2h z(slofsknc)N-4{eB)s8UvV8k(9BS)MN zKjA$TkaHpl*kl#SWaU(##&F#Y&P@Y_kcDSc^xZL^LFPk=cP5yteqLw%l?gxmJj|^V za99-T@?C#>lp2X}?; zz9lW>f@-O#CrYp}*#2d~yedfES6JcJ)B%1bj}n~MfwOM}tN0%3!thA9{ooD_NG46R z9p&W%)8};_zJGL(l6FXHuSN`Z@4Fkc!7mf_tiv%@WXzTy-FK4+!%qp+rB9#yQ9OKQQvx5y-q+4sodh}iy(e-RQ{frkACEqIkSHRc^#1sN>F_(ivr>RA!+Uu)KW{F|gpZtK zHeBj4aAIV;dqoVuCAO})E=)W6<9kdH}zxp$<#gbyh@HYd`AN+Ok!G&}jW($Y$ zJ!xq0VoqOLG!^UI4PKwznuMqCU01E?i9=3K$)!)T(HO+@L4inC2*Vq7PoiG!B%!ot z&2`HtH$1MgZA8z^03{wLXtUom2I2UAqnkV2AnVq!m$(E8I{pgWX`x?ERHZapsXmAX z?)CGf`q^=i;cwL_)tCeaUH!zXmr`IZrE0`XF%2Yd#;Z^sq{G~Vm!$BG40s&%A193| z6O0wcq&+mS?7`Mkf)+Z+Z!okg za$21pOkDp`NYlXM4m%_-GwcG*ch3{nzV3$P*r*f2WoqDU`LyrEv>Ghuz1`{eRU10| z`jQUvD#Fbq{w`vM3mm<>k$ahLYC95cKSH)g;AYVXoiMQs9H{hSTkXie>Ag(RI-%+K zYPQ21)Y9lka5jlPNx}37s=qC_C*jA5ql>qvp_;UUB>lf_C*t>hT|E3v3Xc?|*dQ;;Hc3b#~-oyUzjZj)&*wYyX_3gS5 z?w!%FO?rN24|^PBSqgBsxg>#*{^mG`rzv1zb$I#9Z33=sY_>5xlMYKJoBpY#W&qDC zin4`T21IKQAJ^F#34@9+6yk`|8Bhutfc4!OUhY zT$4(6D$7?#qBh`<_A_x+eA-%h^v(kbNUYsv&g>u$r7!%p?0KmKFW*uFkDgZp+uy}1 z?#I>OR9LZXW4a=|uDX9^$n1~dl#DI=7dcaOyER^6BM^dA1;=--p6ST{VaZ3_AOq$9 zjc~Hl4a)1&-Qz*q=|#!c_vMOa3T|P$yE?xuiNH05ypu^s<4|4o;fs2^Xgr_U)Gbd7 zL(@_FgE>|Hs8X`M*0b3K9crTf8>P{WN}iw(3Kd51es;%)M?c*_Hw%xaX8D6)z2Vhi zr!ZhuT$%c?8Vz0gO}~C<#z9BL)C$3p1Pmws-e`K60>2xU_{oU`2rxe@E_Wf)p;`Tp zlbdS>Y;<_H{>3;Q0?W6JrBFg)Jx1n1)q(-Euo#b*&o3MD6HNGc%oma)O z)2|&aKaPUsprB;IY}up$e#2JuaZ-EebwetvA|tD?pz44F7|FyP#%6veQXl z6{y-cEcv5f5tN0?2So^3MRuV!nS%%kR)MY`L`T zb?sg{GDf|aHIYxpi_`8+`2>M+5uqXhf0FS^|L}U1R1$t`Vet#%i^K4175<;)qA_7b z;_a{6Fx)g+bz_I7KZg0d;1QZ}#;EFDce{Cwk^fHU9PzHz5ONRmy^i#EhkwD}t%FPa z!K+ZoX|6I9NOci4p%0@W>W2RrWv4jE-_+ULT$l)yF;g*~+7xgVd#xg6ML@iyEyfcs`@0E0q%U`+7kw8z<1U_KYDWzWM}=0sZlcq>nllS-7ad6%$;>@O93D8KwZwMGUH^XA^*bLWRREov#Jge-h6P4B2J+65aD zJVJ(c?uPp&2VXwT|r+W#Ja|6ZznA8`x?Z7Z~fY`Gj}3*6Va!M^Yr7>k$q-Wcjf3% z-nt+7xgrg8MbwhxTj?VcZ9h5$P(hb-DQ9}rALK_%zUK^@f{Vqx(bHcWA!^m*XVL^O z+FwzRh@zLBn6yYI@K$0C4l@p+33%1G5FNuxEOyz6a~)ob0iS9|q{T0SRd{t?A1pc|4`K>V&vd>NZd@&mZ!&_fYe9?)66ggs|-Z~Lpl3JX(_ay^ibx-~p z8v#C|6lUwRG?;7ryv0JC?g2ggr=Am@0(oxUIozqfptt8?z?Cb6IRuraGH<@M6ZnIz z6AWAVaLsMvv^=u{@@>1`x%Js@^scu%?A9TLdOWAxyUvK9S1zqf#dtde47?1uqQ4Vl z8mF%3WlF>2p1%(4APkMOW23>RC1KO7Bh_a0tD(c5vs-%E1u<}EjaaX<365_KXkpIx 
z!4zw=H=#sU5{ms4Uj9CshVwr!3;hyK!)*Jy{W&jFQPip6#_ZW-yh+M`qn?_8uQHyN zDmccX*kV+kr(QHZ_Bw7Bc1 zhRRi^t#*}Ec>JRL;bKG@G(Mj?m3Ax*G#^)Fr|MH6aC9gt$ioMOBFsY0-7|+4%TF#- z>C(X1Y3n@<0w}q%G-)m?fz(Zlg|1Vwc=?jjr8{ZFm2^?pZua zVtwAS4Zd&7EmoKl2U^rFe|dI3kkEY|@H=0O_qicGSL2L@*2#UxMHFy7Uhms* zmKAWVzBG?Basy=@(_bIxn~>qxub!V5!l7oT`(=`0G>mZhSqrCs8*{5xI zNU7n-Pu(@cv`a^HFMQ&Fy$x_puu1^t7#j7bD%e1`EBWi(AUB4uS4aPpQ9#1(OxZcL z!CJ%Dg!4q2<(Dq)Tjdl@~BFCVR8(#3<}FcUP^%T zJsi=NpD7^kBfS4_YYGhT7VmEsBS7Z3PL`|(LSAUaK{shZj&F-Mr0(y!H^0dg z=30gGOXB6A*YJ-Fj}r^Z_VOlQN#MpZ@ADE4J_Hvs(Rj>fmf7(LkfL0oNn+G;o`A=+o{@SpZLSv9T!iEt}oV7>7HP9gV zvg-H|E8N(#BJ}u?2Z|=%ANOO5MWt^ClTZCg#Vr2sD%{rzwCAt?$Lm5xqb(PAWfPE$ z&!)M*sZ*0MfxT>G$TS|mx^sV$_9kQQb8D4n4To_!VjttywP2*gf8sXn_Q8zyx>-Jc zC%O_kcl0-l8X9fwV7Q({znUHIWc&NV706nwOcjg%&@6IHSJpNHSg%fOr(e2tKz!m7N9jR!v@V&b7tS5;nb;vI^7|bN`V04NunCy% z;m^qT_^rWsza6;H0UnfW}^PJkxnVUH8sG+`Tn?zolT-p^sKa7ZP!{xYA1LZ7fCyoz;40M#db0b3RtOQTTeyH-}k07`NzNh)_J^ zgN{W+OGA&1E1JA1Es%FH!li23G1EeG7`~ME{aK_NC^`E+*30q(?arcIvITU-QE zKadP!r)_PjR$@UcOS`_C-scfXN^*h&$-vrGck|fW6wnX3K;iOB0jc-bL*6$=L;Mc` zR=%l&z;n8Jp)k}EG=sLR|MgUYVIurWi`$qaoGGbPJ5~M7uo4{;M89nS<6QpJw((zk zT%xu}OYC`IsJnf~)CU;>+<7ROJ9~*6|NQ>EZF|sC&!<1h=X~dW8crq3h$~Py;QZs- z?_9Js!$j-ZJ2W>rRD8X1x?=*+X**vXql*j9|GS;G|6v&R1v0RSt0f}w#pvwyL(&wC z+svxqES!kvDz{v#*d34dLi<|$BFNZL%M%|^6oy+<=Y@wd1M$_zTLW8c4x#yh8!6M` zme|Ht*eQQshJM?5t~U>DV8Nr6ba2fBn0Fo+n2`&D1?hj6-a#asJ6Gv(!8sbpE5a!| zhGQWnJZn;VV-f)pD&=Lvl@y>k3!QaOr9(GU7Nkb^1#@^=sEs}ZSZEel<=#qxa%=aP zo_EpUCqO-_p5g^8qNnZ%EA0c0yDwHAe$)fYp~Xj$fy%%k#H7oBVvr$t@=t~^GgL=0 z)kU`cFzkrXys+WVyy3mPcYhxCNnoKaW0uodLIMMr4Gm{#e5m_Qw7TM%0OI2(V@J|9 zpy{h7%abdjc=P7AGn*LqVBcnH$@Mu?d}#9Pm9w)8>iM%?c*PNdVayyM*X-g^k<;;M zc~JtstY~5Wdpr(3hjW#0oF?N9QK^S%BB7Z5`OMQ}OuktAhhg`|u>;s7i`pBBty*BJ z_`Lh@?}M;dA(oUoK!Uq#p{;7E;qda$vlEBtp3gp}e+E@*aUe{r*1m{I1mUK=-&gih zKxr%MzEz225X#)`y|ID*x()jt+Y`nF9OAgqog+m+)zckH?h0w(#uV@4w@Lt~)dZ)6 zN-{Lwwt42`7zKZ;Y4-_f22a3CB^%3*+Jd~6_e+VJ`mnj`!_%>6GH~XK&UEA^LE!$o z&b5?3-;*}~c<=b*cZRb6?d?8%c-&BG^uYPHi_*A3t@T8;;4YjQ{!yqfAcrB!?Ry>t ztKf#w2gK}eRh+!G_Ys4#A!aISO?y8z$9eBs8KZb7RR8jYxb#lV7uz1#sm0qx;a}3# z*|B5_KDP2Ovu{jAb*nm+*1M^Ajd#ZT`2J)xT0S;$;8g-%?z+CG`(zxQ4TFvpN=4)Q zy^+r%w+3O}`_&$yghNOU_`c{qb&7(7<7zH_dJ_93RwAUg*wfWILEgm~~EXjMEpG^^l@_x>Cb#sB>QNGD-tb7>W zWgQXuf(L_+ZM+z%DTGtr3LhU_*o5Q6VdL`3$)ASBt_iP}WQDO{`xx_)KLU6k_^(oG zrLl4@P|M)F5>6GBJ@1^-LUBdj$~HX{WPD%XJ~w2CZ{uPeBk2>;V3biNw>S{(9y~sO zF)kJbV$Jw?+0*c*%oly}|1!}}&9zc>G7F{4YKHD~Wn-iwdW0kKPb1(xf z^rIQK8mHlgXVjh7A z0ZCC$>vDDnVZDg_EdA9C40QZ=(CKq3R^i#1)VVa&5mbHckdT3rUmrWHwbPL0zeBGC zd9!iFV8qXaKL-t%){aq^a#6u$%kjKTd3c3sfFyVN2>y|6eLohMi`AYbO!Y)U$1FL;ib_n1-pcN&}f?`W5Rzd(0`%-8nvEI zN}{?{-#d|@b7HWEm40xbxKXIv#w{8i8%E{Ed-XXEjQ6 zpB)IJ^}0@z1)ZEWBoN8-05QXbQ}?dvfyBn+`pwIlAh2Wh=7NVN_*{Ozjq+6n_uoCP zw)GYhwh6@*Jl`k=;v|w;jFkq|3A(-8E@=v37p-n&GuXr42}kpD0d%3UEx0-Mv=1C{ z_{=hR!5`cbendQK@P~+RzsjE7@q+EIym%wHz2UUX58{CSR0xDV{L?*uFb2$IDJIS} z6o`r#lqri&2c69B;y!u^eMKeWOb~lEg!)|-KGBm4QO-#*_w|oL@9)EnyE^hA#HV%F z!SLg-XivW~6DxqJZBL71vkSm%cKo&Ug931wS7=p=J_-MQ3n&#haS~FKCR>Q%0*Eyu zex%g#6o9R;$6hzZ)^RBXKZOu9A# z>bA(M%4GV3-nlLH&o&+eT{h#r8qM}_u-Z-1C65m6`NHn=X@=n5`9`taO&#`>J1DyC zH9((DWgfd;7@@7}+s0p+M)-=zKe$7~-w5&Qn|~%Jby1|ttUWSY5npLELKg2XD9=55 zwP&v_e07;Svzf&MKCWtWn2QI%rF9+K-VI@J-nf+6hcgZY^g?<)6Vic`!nBwjng{Bm zM-q}N3L%}FYLmu&4!*e@jXBV90qod3j%b#ag3rp=fXI90K!|1Ot4ix#f$#3LE9`%+ z09vR|Yz?Y}^>5p+w#QXMV~*LN$mkXNv6u;4d29tlx`=OUT`q$HUg5h%&ZTf%xHHl@ z>jE5=GWV0_ItOXLO_Ga6P6Lb4{=bTh1<>VGxX;NX2WG$c77BP#fzlKZ>b4f1l4+f}?Ib8d$T`=LqdY`L50anOuI{ zd{tffx1kZ*xiYov%eO(DSY5{AhmJ^{&@}kR?1gGJeY{RiB)rzLY};cHjt4e3v)rTK 
zk}8>I-af{fNPqL`!Z#KAI({?QZM#+$J?)P4KUXH#kd0qObIA*&BXlLShonmH)2Gk9 z?9AwWT$Ti|mEPyZwZ;cetW$7YUe96U!$|Bc996kYPfkL=Fy32M3WQq23q?nqVqp97 zzhTa*L>M!R;QjnA8Ep1!jpUq5ft_b2xu5)|fX!%2p?z~K_-V9CO%XyN;P#^`Y$Mx2 zXkwRk*5S~EIU~Nl_x<*$e_(7-SNjkKe>R+>Z45vI5!e4tu|}e`$B%RtKQbm)vx_t@ zkg?O#=;2%GXf)aXBI7%w|hQRopzNs9?H|{Pd?;?PrP-*9-XvAqnw6U zLBSfB7B-MTY>WQclUMb5q;En9LmNKb*lVqi`t`y5?K}ri=FQQuXOlii>QG?}bx5ohW%Vw-vDRro!?nPtkJy`0>1P!2@z#YF+%>EVtxsE#qV7r<|JR1W3)NGb z^4gKx`%e$|&33#vGjQ#7PCEva&Pon{Z9`@C=ccFMw_@>8#`V?I7JAKAXu8ie;lhvU z;OK1)sJphS<mN1+%Gm(=vFJDkQGNuuj|yTl0` zAuoE(Rb``3wv<+}DS^E{Tb5NvWAI!nwPkfY5HBApa(=zd1tUfyZvK+f!;9m-iKBB8 zAoTd;q5NunI4@Y2d?U{g7{{7gUR^!_J5P5VpOW;1_c5xZSbu-enNS%Mp9q2>(uG4l zr$ZrG#Q%4RRS0Y$t_55EUGs$<8>cH6G8AFE|Iu%{gQ2K>!P+`MEggf71Z$oYEl`^y^to934m;9qNsp^=ZSw!0V-5Uz$-9DPvm? zHQ7#Ow<`hJ!A+68081@fA?5*ZfkTPfNbO^cv`$W^OH_O(5 z!)UHE!B+>r^k`|b<3uA2zrNA&x3n3&9z1!^A=?VUUnh15A83QDlq)1J)^?Z-|M0@> z?Ef8HcRZEv8&;HvP%0TEN>b^IN~O9rm01cQTauKW-6=a{WRtx^l#zYgWFF(l$T}P| zJG=Ou&+o72^FGh>&-;w`z3%(It}7R~wlbWjx%1%6s)6D4H+c}F6aA|>BM&+-=khnGOi@Us zBdBIUIbEB7AVBu!}qX?W(^%_&CNp7%4phA9MfFi|T5)Y$Uq-*LFQ@ z-1cLvvG*y~^{L65p3uko@y@%1ef`R~%tBj7&X$4j^TDo`dKMs1Pdvo1z3hHTNlp>xY{y zg;uH{S=r(%D`y>OybC;kYp@obfQ=Gb+ka{ zMxsC?Zwu7IR99476Lc-^kT-B>fN-vXs+PezkmX4Ua2cqE8)a56`GP7S>#3HJw`D1) z21OK9?a2rAXD=G)PJe^PZl=bJhd#q8$J;h3*|Cu8V;UT691K{*{coD`9voDJl_%A0 z;k7TJGcvVY8<=9+^|IA2qIYS@#+Z{Tj+u!c=kU`+hn=J2>!xbBv{lK|N2iTf&KQuo zcumpx!Jcf^Ejx@Sj=f~GcEKkmN+ixPZ*+FxqSe(2z#A!C+aETD;+L1H0mk}K=ts|@ z+}Iq8#;=~=3>%Hd9uIlbW%fjDB?yN4-tJDo<;0q&35{_Wvn^CS&><2ams zLYBVIFT>E2B6M1v!x=^|E6cvhUgjAUbt?CIYrY+!^@=G&z#)o#LL zv8;lTI5q;j-Cg~;x?dIJhZkg(wk$BJ@%U7zlq(Ke%+Qyf4Z!+Uub1~_qLK4b;X~>A zWQ=|5rt*mUD{4#T?VWc0j&kaRi&t3?+14~wo>x}lzPwiRF@-v8_uT8txYme@jDn%0 zdLsUb9qyW0X~VJke@yNA}I*bM|B>>^gL)p*qq8$JK<2ycngha?#i_g;yHl9Cp0B zY-$YkeatIye{5mrMc%nr-BB7@$f_$GwS!oIKF+DnpBYu4 z!-{c5mwg?IzEZsLGp89vc2lvrN*mVMe26>DOhS(?+V)7fpSaA}wEy{N7k=R`)xGhp z8|A$%jvh+w#wM;K*~A)Z|I7im9L}HU;ygC1kmTz%k`Q%;LzaZIx{_4Z+p4Eo zWhS#Ke6phBIJ{vFwYvwX)HfHXImc;Ff8Zl<2$+^tyGFu}X{YsZtt9ZASdQ7XmI z2J})Y?H(8a)BR!olBzwxS9jZxfq?|(4sJ4Xf%QkNM&hmj zgAlhuY!mZtVfyeLk+=GaQGGW0Z4IV!cx0kkb%)Hy+GOO`qNtZqV(?-bQ72m`824sW zoZUU;i8JZ#wfomq;4fTcd`Fe*g&w$@;AvuuD=LI*@sEpRHy>Ccd~cS2mL@%rDJ4`bFIip5ZL6E=tSv=qR^Ari>8gPeG_$q z7!hz%)23_~2N;)^FQklO>aep1pY8-Ms+_7RZ<@mJoMPqqwrQ$SkxZV7{w(?tPQ6xh zl9xsWB@n>l0+f9do?Y!3BOVzh>(XYqh9ahFuqGLn#am~c?0WIyd#2~bbe;Hcv0K1#S1Vp> z^mMgW_<`axd7+7OE5pN~s~mPel!GjP*)VJmAg1_#nt7s{Xcq2S*0KWiMU z(ZftdEa~%Q%>O!ef$tnuWa<;an!`ZFhP95Wm!3vIhJDptt<+eE&`&G8lbis>W=0{l z!HE#n&zO+S6%WnE*K7_NhCtxso9tvNIVJm^WVUu`G64;@ZJefTFTxU+rwii`Yf+*% z9~a_VF-5JFbDv8WzUG~ld$B%%HiIuKkGBrvCHtT0%}!%DdB*e`ZRaE!NxNP&44B64 ztip^79<%7bYlUt6)*PBfhG+QPp2JG(A(x~#v&b=_TCVLrjTT!^wv|^*qAph~N&6XL z3^S#K_Hbtpa5V@JXMG`Q{yK` zb&_!0Ad3I*eH|>Uu=%=&CJekqcXu7jjR)qLv$Ag(lc4BNXSK;z0^HJ@xY;}z3qNLP zHWbPt;Khq+L)O~?&}^vkJ(tjL3nzmEcrP&cLbTu4zR%Yp;aD@5wrYF|SS)Ynwz!lH z)6HvqwxY!lu+Z%|@~IMhn>n_m80+D0^EZR*0Yo_Q>2ZFEdjGi66Z>{M z^+Gt4kV$@MKUB(`IZhiq2q|WR5BUOzKtd?zM5h!P#?NGQ5$dzakioyhK6{J|k9%)I z;UXEj9B4V!=gGj%$8qYw92q`X9SI{Yk->ei$9A8>mRt!PT%Z*6CvO%~Xd@sd%xS#qv?y?|5;;V|HT~v~b z5@9!wSqCUS*mYGTg9z#la)Lvm^`Q9S(zr-*B~?}*DK)553?uKt9i-K>LHs@Zo@q^i zp#@jTcDqP8_|c&;U%?mD7k9*41Uuv4i;$)9Tu)>WojR7b%Ns{G;|c;!I%9&g4NpP9 zQxLrO-FNq&FnHx}uhvwQ3EJZXMnN0j5?Bz54;b242V?5F=U;EOLg{pKtHJednCWP( z{nR@M`7&!>biov`ms9UrOPPe``sv%#1GB(dp?W%-umIev`mW~&|G>Z(h314m%OJZD zXnrSV6$F`sAL=x&!GX)sM}rO5A?wJW!-LP)p_Np|Hr=`grwKRqiZms!f;kH_k!Nce z%KF`m6{r8e^SC$t`X3hHaH7v8-G8$XX)71pTQLbP`jj%IFBB;DD~=7C8-$BV$IWP- zb_2w-sTOg!!MNPI$Mzd_ARlM?SfjH99$MV3q-V_nK5-#wxi{hPo33Hj#nccYFP`2; 
zq7oem(HVC7f;C>)L|>P)8SIJjlm|0X?7qlVKN9c;?rGH}83m3Kt9jLGDh6|_My#w7k;g<2# z%+I53u&u%V?a_`OFyb<}#+_UOt1&-P);(juO;k8Rz9kW*RS#8M%O(_Jr;bXVw(o&8@XZ!O9&N}SowUyWMg^XtvmDv)U*GlAG$f)`o**V6^Q zWB16A&eVDq(rlA0Wl&GXGvdTQB)v%Nn)B0GpuGscE>wCordwgAm0}6!KR-;Vc<#^b z9fwhg55DUYj($bL9^+&^+hPnCypa?&R*f`Ru6Q7?1z)rT{+1_^&`7~7FKH4T;&vQ6EIf;|1`TG*ne(WmL%OIRwTQ!O{uwEXOL%2! zPRX=r83!tN6UePA$oRu{y~Ar2Wpd+P;p2?1%PMY{a)9r{0r~<~`LY#KlDjT{A33o=tYq z-G{!S(sf%1bdJNNkZp2=RX?oDyRu|jV1=jKbt~`g{|JPJ{bd%Zk?>4K(Eq+?0(h4^ zOlY@Eh2j&-g#sq&V9@g_ecwnL>}}IlwwH`P$7*>xOwnEm`HXke`f#fsm%|^i< zaCmytd&R6D&P~#4K0P%AqW7eH+K6Ph5$Z>dIygcVBxt*CUmu}j#+PTu>?u(Djm8wx zDDcJFH|%;8g^K?<>#B>*D8TO_*-E%W0S$o*wWItLa9AKEFrT5o-9p`svAYzwY#mB9 zilo4hk)H1NEb4rT!rc)i6yU038|(6=fL~Pgu2N+<$U68Qun5Q945KuHCpt=WT$Z!J$_$+=p5b+PiIuIyz~& zN?{HE4)^7S1?nzWL51u%x0+51{Qmt|dE`qc=<0C4bYLEUwiv$yHHsr}Pl?;{Uibve z*$f?@6rP2BC-(;Yda(e6UHcE%oLmAs4o)@3%oVVocoubVa1BIX6_#C!-XOs0J8c_v z(qAB|6htK0{DYaw`<~;})}tcuc24(S@O&~Iv=P4nYg)DceW9$uJ{!-~SGg;|v5mJ+ zT6hUUubk8JGg$yh87?!+3$wsdwCbu6GXV;v^kG%X)D3H8c^Az-07(VQ^L9C%P#jX~ zzVmqtuynpBe*vy4p!t?0Y8w3=#zoYaHK-@(a9;;)n7pGJ2iAQ4JOWY26nqi4a^QmT>oYCpCh8>VF`k2W;+Ye}1~u2k)GGGnc;% zKv$sUID5u`B9QMY9w`0O2X@V}4TjW-7pp%7Py{-`$XGdJtCk4P#>e*TV5)&P{ZV*b zB^TcQv2*$>r31(QvDndWW?@c1!y8lmQmQzjW2bCXB{~Iot0WW1x-YLEdozxeO7qX;bRH+=J4&`!S3x_zfm^( zx9ZOqgnS~u_RU!`G>8}MVB7l(v|1y2aAylJADMq0N}{GeZko-9Rk%S5Y441Ib~s#( zxN{+CgBl>#rARI>l0fa@HQVCv{lGpb|KaxAA^7%QsLt`&5XeTVFl>hY0#6xJna^=; z@O1J{-BeH!+!rNY{ZGyc^q=KCD3hLS`K*~n)1VjB#sLa~A?(9W|&$1&8 zci)atGtM-m9BNQsBC!(i@#+-JBoR4j*Cs%ln7I8z-YC>%no=~=hG6X43tGMSPGIIJ zyK_jd5^}uVZ>R-Ff`&3fM?7sXz^VFT-b*pmqJqufFNs_jlal=`ntkfEh1!FW0p#5r5VAI?O;)tKAs zMTt?M)4dfRijg3eLcY!D84ky`t-Bqc3xU74BdpCdLqPmfNY_=yP#_8<{u9Uxh4f0o z4%(?uxPGWyy^;FNnV}J7A`=R?TUJB}2go6?@GK-+I64H(vV`u~xr9J`o7Z7&w-6wi zaE9%V4S}8i_0+SHsDWJ%7PE7O0u<~e67)i;OZ#W)L3AkG-xSS!QyU5@ieEma_JzW` z|J=$;$3vkhl*qC^7Yc!vfx6y|kIZ%SUOd(3Rj~~;q%)+Kp>Yi~WC*xcw~xRmPQRnh zeg@8O6Q!fun1poV>blAJ2$b}X1wL9L0qbF*A5P{5puNc>arW#FXuc=mM5Y~rrIkBf zb5UdPK~_q1^UM^O=~+*o3HlAB?KT#+Otji4nX*SI?BIg#=+aa-fbXt8i=*8kL+@s1MG62{QGPk zN_unqDBtHnU2SLNazK8a0 z4fJw+iWgk-LndqGTdS(IuHx4T2=gfI$n9%o z{%rAZ@@M~e-WPXRtDiQqu4)9acY9P7PWOT`S?HFe>k!#Hqp+G=Jqd_naIEub`F*$qUdVhF*vd~B5x#VAAN3gO- zg#+gm2G{*ilb6=nWP1?a*mGsevS8{+>n&D>EMmY%H*HyYnss?YDtmOpH zY9QW}I>Rtl4wuQL0~H(jpvkbJ7gw1MhMcso&6#^wbR!C5<(1b31^&b+#sF znj1vF(Do?&90c>;j|GjS&0~QjX@ob(D;W&W1=BrpPJ?&#=dO%or9;0}Bw6W28qj#6 zczH?+X#L{hnjj>C{e#Mt{dI9b(ht;%oR0yKt*Q3UdtzWkPFv~xNHmC_R*nB99|OfM P2aTV%#r*#aS_%ITYJ~t> delta 23828 zcmWJscR1C56iy|Hh8-e%Mz#p&+N&fKvbpw%YhT>!a_y2tLnpbs!&Uw%Kes1S)x}Co%r;)1EtNLO82?=mEB~)jS_JEm=60Z`! 
z1Sl>&II!JK1twkv-*?|cfO{P(cQq1@peB#iYuREV{`fxfac*Y_PC5w%PAErU`E8bX z_GfV@tZRL^s~`^7c2$@@{t|=fGZnwdiBZ`2I#kk=GXn2Qx61{`gkjmEM^A7xn2NdY zXW!zrk!6P_11>#=X$8t106O1=tdVbc_O!yBp^jHK<8d%DLO z9;N>Ds0}>hw0*@uCMU!1+%i*UI3CE>Y= zYZT<>)EDg-i2<>}?nT{jY8>qQBDwebyEu5z%wF286#;|?ttJy9A#gFbvFLIs5kAui;iZ{frg5LLzh*WD7eWW`x@LOpCB?`OTW?L^r8cJM0GN)YI;YaJLoM*?q6_wwfQD3DBW)O~P^44d=ENdlKBFyzcy zrlm#$(sT1_caCImU9A4x)kcR?Cf*f&E6HH>E^>2_OahQQ?an!GW0ZVp;ioRgN!X#e zW|LFm45y_RZfw<3!y@<1R1w3yhB)me`1AdkArgW@JxrTOVm_a#Hf3ts>4wY4q?&L&-Oynh_Mio~;6eg)%`|Lj)Hmm`sUxkiGo zApt+Kx|eV8rQ`Be<_8?F=vc}UE%uZ*8J&Av4Pyy3y!w!Oll^@y1&gOjMv0ANbd5`{ z=Drz)Phw`ZPvwyCfZoxw!K}gfh;!~rU^5Y`FIlY@6+2+6us^#M7Xcr#3&rHT(1(uQ z!Fwc}9HCq4=+(AAMDQt?vkK4*h8OCT8~8Y(DZ2FS8`^zmtZnf_;5^}eny8oH;Xz&0$KmwSEyzKk2H45r&9{Lx` zgg^?r;*HYv?6fALlg2z10Sf z2U~wJ`%!S`t-ni0-qNxE)#BN=qjbz;v#6HYlZ^KrYfCnm(lGu`Qd4~^1*iEl|J6Sw zWA(|(*jBONo>WW%<}CSyX%0p_tIG}O3!fjIDtf~t<@ zyNQcr6iLqJ8u}Q850g!J74MTU|Acsh^Y2N1D;pjCH;2s+pEJ#?mmfT*!(JA0Z0S%P)8RH55Z;Mb>iDd7co)4?fnPsd5!6qs4$oawfp!r}7cN25n0 z!SaGlcu`g$Ea?%&gB497xWpmm%vVF4&uFy!-DHfHzLNoYr|itsFkEuJCTxt{rX~^kH69}sCG`wOeq=7eJ@=Xwxi*70n_J`0~9>d z+LwQfDGqP9eRp|28-?$!>@BX`A)%%@Rb+C+BN&&+`Uh?Zd1Kqx7nyN&j!2gI!msdB zA1!9d35NOv2>iL~?12uT{;KuNn+hU0T~>W%IT8e$Yj5eV6_8-l^UB#n-BCb|Vc)*c zM~301F5X-QC~IW2RgdY=!16iWn^Qa)X6ygc5t*QalK%HE3P~yOi~8dCl+ory*d->S z^|U<_=mpi@WzYN}Y|qs`TBR1$VZc~_pfSR%P=4MWV zY*#!Sr<-u*Oq-pD~8cy_&^GW;FOnE--qxFBy)hJk24^(7`h1%NO^K6ev8xHGa`M5vU6r zqpx0x1Z&qXo6;WnLWr6SN7*Ly3U7+!D$yBk|Xi?}M%ziRfyAL@SB0;qW+L{vHLc zru2<3=E=aH)?GF5k^)R6!W2VW8Zgay=2Hn8$zaJ-b7J@l9V)jgah5EmfPc(z=~8zh zbPx$E?*bzsrDWrYqXFK)>(5yjs4R{*pV%r02btgp){}~JzNXlu&YR03V~YIiOlmHZ zCRh=17e+$Nz^UKn(c>Nlom&U8GvArO!v_X@<+n}2{p8!>##IyGV(p<`p8lr+8#VWo zMTB``JI99&3OgcEgL?OO6$=e3H`rge%$tKmw<4H7g7Y;3I@ ziGn$Fcjg#m@cX`v>cB$^bT@~uOdX{Gaq^62tZFheRtLN^WmM8&>~w7jTPj3;mhagl zK?AB?!;KaBNDz$@xW-xU36JjC-MeS6fgh=#l4d*?Ot9+r{nfjjrnnxW`cpmB6b0P5 z{qh;#CvM#6w^}s?-Kh5;G&@ZoW{VtqA(JT_ko|n^GUIz`QgX?{K~vD`X#Hi$O@N`! 
z#{yNHZg_>_^Ze1h2z(dQcIK4}4G%1L-kViQMWrtLxu|(M@~stiG#Di#RpqGG)LADQ z(p!ierQawR^sHO?kZ8e_$Zu}GQQ=Z#NGHFzqg z9kJz}*pVqI1N0d<+n{uu0Igb*d%1HQprf+j)TeSH>|OS(m)aN%18RncTP~BJ!R_pp z3(un9A5&3#R2P{FDr`BTN%trqH}Loh-I@ktC$_R@GWe~1dgoE{6djVh*h2P6rNS$H zqSkl>4cu(=9&wIDKzzYs8R4iKkebI0R-*tHK7CU^l4Odhk(L4j%w{-{wC!VVy(y|~ zvKLA2FhPPk(+_($Q=qvE0y)bBmR66DYrUzapcj8k(B`Wt{8#ku!NXgo@R*~j$9<5nqm77N(IsW>MnkpI^+Wa^3TrU-`ZK`GGVSutynKbcDI~ntL{&@M= zo`&JyYo?e#P%z_SX9Uj_8K=CC2S`pv;m(c{o5sf^9C~-x;HgMaE8J$vQ zXW_%oQQ%DLWhK2KgTSvm`hhMA42!$weRiP1k%s7-)%%m7r=3-0WSkBm(l`HY*Gq-_ zY2{Ka?KHS1O1wk0B}Kqc3)%FKpd(CMKbX?VHNg4T%(gz?O;OA^Ra;ro3|Wi6H*Hxq z#fsmC59WPM@Cv(0ht4KbSi2ZsrEtImxI-5_I`d88hMLdO&%$PKEq~dPquUf}C?k!Y z!p2Z{Rk@{X+6L1XxCpa#;rMi$Lv-?I8gBl!i~4klkcys}Iwn5<(J{xKsW((P84tSt z_TfEBLzb$r%tx;%XpH;m%!6dS?qGN7%mAa5*L_=@>PgsnMu{}a6^y@5@qfI3kB9+# z-qCM%IpFf|E)jNV0mocIM_Juo`|ylbsBQJzSS%_>^ryrRruPV>aVj!2F5%1yj7$cyPn52s13>epco; zV2bRPJR!TaOi-B0dN|YE-URj=>{7I~HGw~W$9E+LnL?3KtwxlI862jZDYfe`g*$WU zZ^ng8z`5XWnnZ*JidX*WiTp}JLG8a^>m-x#jAFdTas5;@X`C&%`ks!f;vQ@gipfa$ z?-sejhK7ea>?D~#QZQZMw}LNY<~-fKa7V|yo{d$i8Omb#lKd(DLt2j-a|2YxbuG}V3RD;2Pz~>BW z6A7%gC{H=RjDmX~E)B=ekRfy>JaB;l$^@gA%4ck7(3f%C^OH<6P&zuIf4!trfh;j< zrF1A2zMtB2gIg;JhHdk zzx(D`txWN^Wvh2L1C)nnyY1gZ7()#0iKx7=3Fs^?iyb&@3Zh|yM=ExiL5l|S(~FGX zXYYjB%`1E<$hzPXneld;yu+v!d2FBBCBgK)@zt7 z3lhAs!pz4;|DGexM7Om#o-javQmL`W8bEdP6P?f0*ACE9r1j6Kj0i8X0vz9u2f?F< zkJ5dvl3*>a9hR%3z~Wa@zePV8)c4F;7%-N~D!KZfcZ_M!o!7igN+=l;Hl#k9eN2a7 zq0xp-7O9}zcswSY2fbZ+N7>UH;UA$mS?enitkm7#XNmGAq|d|vE6>oP&%unTX*Io z9sWB#_R7U5748h*P2ndeL2-}I10q)#q@K3C;-;bid%k$~7Q8UV&$o^X^eLL5G+4G> zb~nSKF{{+wRVFAfFIV~KyD|Pg_A7ewKSS`7?VUDlHv;~OhKJA9OrcNJlw=fP3J%oi z^0o*SGuUvs1*48v)fQ-`4pAY?d8im2a2U6J6NXYY5A>irv zAap%R8X(p&9K;HIr#U9~O(0C{52D!P@%Yb`@@Kup2OW zB_bXScht5gvh2PmNMVyEUyhSHfFmAfJiXc*kJdQ zu>t9DNznRC@kQ*!=7(h_*c+DocNmOOid|$=orxj1w7PK^Z8d^?If~bf(9S{G?xcZDM88PnE86^W6w$IAU^k1T2sDe$i*ccfpdUIR{-ZJEO*BP@{r6kpY4*v~gj6Vcf92XBmrBQEE%{VpoiN8J~0@xR*$kT{>l_I8IOM8tWFhjIf%1aW9VW! 
zGWHx&gmC1-optI#nymZJ{|TSMMZqYXjTi92Tfv=Ll> zB^Fln%>@3t`BH9|hzW3*Ngi4@GlD%lS{$o8HDGksy7t6=Z{(r6b4OV7C!z7RJ}s-2 z6bw>!wXV;kqs=AdvpE}*u`{mdy&xN z4T1U4dH+-%6A+Sn)?@j_7~*@mbPLuDp@Z*||2&r_1b5uj>TUJF*_Zt_%Fk#h)$n5P zt-cg&{hhpXZxkIBLJIc2TS&sd8P*4!+%)vP=F?ynOF`>L2F^~|R5GS(KjgTX5rveU zFCFfjXRK5d0WOWEAlwi*=sulIM9Xop?v{23bjxz{?R=nz)_$hfMXu<>xk2qR{{I}{ z1xsrD?Rg?_R*VRLmk5S;ag9zYmq=iY7qu{!geea%PGThPFyk;Lnfv4T2A-W7u z>Xq*Kb%i>e1P3H#`ltzX5Ie8qSJ;yR>n~MioxjlFc2)LqzH^>n%A~`V?x%&uHp%8x zRYOd!(PO@EYl1br+e&i1jIm_R!6JRt2${R3Lp{G3pw$e4BSeA#SwUvmq+)&0)qHfC z>xv-++qXY|&}9Tu9QIz@8jZm;gq=jK(Km$JNuxZm2@NP*3o@)$a={hfW5QAXG@MhZ z`+o9j3jR!>zBw04NBvLHl6+%HSRfPgdx49Fea5CLT<0iw^(0?qSt1#$Bu(C0l|^BG z@ZWc)jU;R!He4@#AB2^jyFSE!CSqRng-ekPdS|Ofzg5!L$9LU2)H)6a0$j8|%W~bw z0nRtk&3I1}VO!H_@rd>y*x8|^SmH#2$!_Vueo7SNX-*Da2qA-Rg$Qeb9tF-yJN5SR z(_rY<3*qM8B=}7}9oBZ74oB^W1}e)_AhO12>|hKHe%5)2$#%E`j~1tGY@HUm9LN_n zd}Dx{%Bl51g5k!Pp5nRmgvS_%o0KM$9gHw&M|v&OAAOvve+CQ|v6OWT4@~Ik(MG^JU(08hDsgM9aS9^n= zKX!n$wJn=TNkoulXL=IW7zDH8y+w&OB)A>I_HfZN3LJG>fA}*hNtMb!YvL_Jfsm*5 zw~g6o;KKUSY3Ozm_{2;296dmXPU^V;zQhzbOHc0E@+}cI7i3hcc{(s!tAP9!HBH=g z(v{$xXMkg-OS>WjjqrBjdnprnW8BsreaxKG5M7T49+$G#$M-Q$F6xfxfRGHUe~B&u zm#)urp|E&wY6#xGt}fFYc)ia~a+>6=CWCtme5zel{52bx8Fv zeOu^mS0=si{>8LEWVfR{Y63U%x80KD0#N|5{|LSBYxH&7UunNf7)ybnJ z92$k!Qr;~;)FWZUU)`?9aY5+tn#p)Jj7r314PWW?gATYc=zX!GJ^|zYXvio3(}(9A z^@VP?9ih{^tA%in2$D3zg7MKHV6t9QI+98PA^M&{Zib;4z@w0vp2{#Hwh#>~)hXaE zZMk&kdm@a#G%}LAl?17@AD{2P>vuZ0y@LAK2AW@ZCW{8D z;@sj{MWID~y!V{)%!W}q>6C!Qz>`M!=GWtE{nPq*<5?ev$2b8!Z@2C&chLrJ-|bm) zi+bQMdG~z-O&=QeuKw&ftq*lvWQr2IA&fZMd`sQQz=RIVWy>NpXg4@Z%+WoJf;`UV zKO8BzalL4jief4F^lYEB7AGA`Rx6G+QIqgx(bKHg-xBfr!;kKB8!7na0GY68Nyen& znJYoTQK%%hF|g2pgnnUDE783{_-ja>tNJPt6Q!J-YeO86d!X*shHQQ8&sTmnKqr9u zK{np47LGuU?iIV~N(6h!^kjSAAaH0Dr5ES>@%;1N!*;n|K<*$^aYG0|s_GFx;`lG5H(9`dDcH>u1t2z&(7_8P=S8nqb*j zJN;Q*7Yc6Nr1pvsV1};K`&L>XSgOVsS5p~aT-7{W^GO$$N8aW6A5eiG#ox-hwN24^ zKYO39aXe;gU83Jzr{ks15u=(@$+$YYu=Nlr343c&?3Nh(&$2h#5;2~DoGhhd;Asdo!KDtu&}-NqKb$ek0L!r3mmX}-zUKndMu>Fq=Ia2oq8}d{SIUpi%qq!bY_ykZ$hlw^@#bOR=R_ zW42LXV4tRrT{#1k=>f0YgOb4ALDGpfl?>F%A62N^{&=vaPAYJx8^g+B zrHk_w`*6pdMI)hefQrAlJ|KqyrY$flEB??&>WA^rTh0V5X34mHT0sY&|I+dLldA@z zqWP(>wGINoeX@j&TMvAkJ!WNJ5n$>>_4xKn0C&vGYQ?K{;J$Q`vcSYXDmeZp^(WvJ z!?hZ%>42?d-10`H@5}=_#%Y!*J+Df}=QsCD^qok;=wKP;mc~TXm;W4RoSlHdZLROB z_mlAyweAm5E(+a3GXBdJCLy79qg$eH5H5&J=-gl-qA06KrCg~!PX9Y`l240(VrTq9 z8?F<8+4=})HwV=bK5nTu`gDp2vHGz|`}Q$BowMcD9^NDfdKF{y%{B`9UUA=&pNR#- z-Z2aR`2;x1Iqh|@G7+4r-lqi{CV|_`V`1;XWRTWX)OnaghcTDAl|8u(H|>bQ$rD}r z5a)gkaj-vo+i2KWPNNF>a&9o9l!dB2ZW z#gLV!5lU8TF-bl{HNC{fVwAawYT?V-+#12Loa zJI6j9d?P;JwB=qj${E*|nRwIj?qxwbH#He4cedY)H%P)9_8oouQxcID(J@E$NKU{f z$Aq`5v5@iG)#95=4A0k4QiwHLl!V{tpOW9W1tEp+L{ZR^7c##L+Co{F&ud_8M4x}DUFBdAdCJ_)7fIcq#*iP zM*#swwN{39mdm2ob-R(!8wb!(U1x=SgMccsnJNACy4b@N%sSJkgVdwG8TW5#BGxy| zjZu`~x3L)gjF$$y;n{vxo2(5Rby8Bc>gWP_AN87shS)*Kyv#X!q4WT-t)0-67Z+e$ zv6Mc%bk)M|H+BE)3X8eu8v6H!B@=#STx z1k@E)F531g7B_lcUU>K+5_|WqC5C#Cu9>`1Y$(&`3t#s{TP-l6+fxG z3DQTolFnmgJ_I!iH zmq~?0=qp9=_P2hUGphm(_vIJ4HMD?(%jq)CYQd9v!E(MA+EDUet98#Vb?`km(v-!^ z2VS~0P9A@?&^*me;*fecCR%j)Zt!G)(dhisNzG&`-Vi#ih+3kl?qF z8IgeL#-i!#wz2p^q;hNEgGhX(L3#2;nS{?ppIP0f1~SGqW&Z0|FRbrh+*Hf$h+)T{ z_?&;Ihgm06f0hXuK)lS}hcl!0u-;rw^*>Am9*VAZYIqQMcc}|W^@PFI>;V3Xq$QP+`l<^)(~|LfGjFcPbU6hCJFM}rEYbD01`hVSLuJXD{b3=eqwEx6Re zL1^1f$I@^Bwf$AS`R-zvy4U%*(zpiRp{CvcYNdr0c?DBUs@hn6D2ct=TN5{#wYzg1 z)Wk8F`5AZ9eQ>`@E#01~s{};h^=ZWf4Y*Z!WJB(G4VdX_Yj%-82pM&gTeF#z;kW~M zz5T}zXVzx_zMs;=9kL1|kGn$gWJ=-TwZ};)Y(h3;T2I2u?rT?`7c$Q3>_ITRKM}cx 
zH4C&JY`zr?&V}Ho{uk#~(?D6Sr>o5-3|yE`=LFOFKz8x(_nJll;8xyZyK0;UzFe#4 z(&I|tT#{8=c78SR2(U|^p{@g|`^inmM5_V9xDpAXekG6}%(SF*C=0c}H8>YmK?o1=i!$5dWbnWdK1je4}pc77i;IvXm ze?>wyg^Aa~pE;)u!J}k*8vOx&{pZYg+mFEIrB;o_~{8l{f{VN}3P;{5K2j z+4Yi3`{p2SjE5ADvyit<623M%4e`qF+F1oAz<2qs^1bRI80%(y%01c)7pu?2ZfHAV z>0{ms_WXL#$>*KY_8Neg_QU#Yv7=CT=fZ$9X#$E_a-@uNf1pKB&S3d=J!>_n83Ra#0?h0a!QHE|%5(xP&FX|!wXb!3| zTyq#(CN<~#-&b5PvbrnD;DO$kx}WE;1Yp2ja~TFx7koBOHMV~|8ul!uhNLFtfq}Qo V^fCD|5E*AVu)DwF|7*~a|9=^u2VejI From 5c83d9df544c31132686ef300cf429b019c62de2 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Thu, 4 Apr 2024 10:18:33 +0200 Subject: [PATCH 06/54] particle exchange GPU --- src/MovWindow/SimWindow.cpp | 11 +- src/Particles/Particles.h | 2 +- src/Particles/nvidiaParticles.cu | 171 +++++++++++++--------- src/Particles/nvidiaParticles.h | 10 +- src/Patch/Patch.cpp | 5 +- src/Patch/VectorPatch.cpp | 74 +--------- src/SmileiMPI/AsyncMPIbuffers.cpp | 31 ++-- src/SmileiMPI/AsyncMPIbuffers.h | 2 +- src/SmileiMPI/SmileiMPI.cpp | 3 +- src/Species/Species.cpp | 92 ++++++------ src/Species/Species.h | 6 +- src/Species/SpeciesV.cpp | 6 +- src/Species/SpeciesV.h | 2 +- src/Species/SpeciesVAdaptive.cpp | 2 +- src/Species/SpeciesVAdaptiveMixedSort.cpp | 2 +- 15 files changed, 189 insertions(+), 230 deletions(-) diff --git a/src/MovWindow/SimWindow.cpp b/src/MovWindow/SimWindow.cpp index 08ffada69..6dbb5da57 100755 --- a/src/MovWindow/SimWindow.cpp +++ b/src/MovWindow/SimWindow.cpp @@ -384,14 +384,9 @@ void SimWindow::shift( VectorPatch &vecPatches, SmileiMPI *smpi, Params ¶ms, } // end loop nSpecies #if defined ( SMILEI_ACCELERATOR_MODE ) - if ( params.gpu_computing ) { - // ADD NEW PARTS ON GPU - for( unsigned int ispec=0 ; ispecvecSpecies[ispec]->particles_to_move->clear(); - // mypatch->vecSpecies[ispec]->particles->copyParticles( 0, mypatch->vecSpecies[ispec]->getNbrOfParticles(), - // *mypatch->vecSpecies[ispec]->particles_to_move, 0 ); - mypatch->vecSpecies[ispec]->particles->initializeDataOnDevice(); - mypatch->vecSpecies[ispec]->particles_to_move->initializeDataOnDevice(); + if( params.gpu_computing ) { + for( auto spec: mypatch->vecSpecies ) { + spec->allocateParticlesOnDevice(); } } #endif diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h index a155baf7a..13941b40a 100755 --- a/src/Particles/Particles.h +++ b/src/Particles/Particles.h @@ -476,7 +476,7 @@ class Particles //! Extract particles escaping the box to buffers // ----------------------------------------------------------------------------- virtual void extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] ); - +virtual void extractParticles( Particles* particles_to_move ); // ----------------------------------------------------------------------------- //! Erase particles leaving the patch object on device // ----------------------------------------------------------------------------- diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index d7a63f0b3..191e0943f 100644 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -33,14 +33,24 @@ // Cell key manipulation functor definition //////////////////////////////////////////////////////////////////////////////// -//! Structure with specific function count_if_out for thrust::tuple operator -//! Return True if the entry is -1 as in the cell keys vector for instance -struct count_if_out +//! Predicate for cell_keys +//! 
Return True if the entry is equal to `code` +template +struct cellKeyEquals { constexpr __host__ __device__ bool operator()( const int& x ) const { - return x == -1; + return x == code; + } +}; + +struct cellKeyNegative +{ + constexpr __host__ __device__ bool + operator()( const int& x ) const + { + return x < 0; } }; @@ -250,7 +260,7 @@ namespace detail { }; - //! This functor assign a cluster key to a_particle. + //! This functor checks the cluster key of a_particle. //! template struct OutOfClusterPredicate @@ -286,7 +296,7 @@ namespace detail { __host__ __device__ bool operator()( const Tuple& a_particle ) const { - return thrust::get<0>( a_particle ) /* cluster key */ == -1; + return thrust::get<0>( a_particle ) /* cluster key */ < 0; } }; @@ -467,34 +477,34 @@ namespace detail { // - compute bins // NOTE: This method consumes a lot of memory ! O(N) - const auto new_particle_to_inject_count = particle_to_inject.deviceSize(); - const auto current_local_particles_count = std::distance( first_particle, last_particle ); - const auto new_particle_count = new_particle_to_inject_count + current_local_particles_count; + const auto initial_count = std::distance( first_particle, last_particle ); + const auto inject_count = particle_to_inject.deviceSize(); + const auto new_count = initial_count + inject_count; // NOTE: We really want a non-initializing vector here! // It's possible to give a custom allocator to thrust::device_vector. // Create one with construct(<>) as a noop and derive from // thrust::device_malloc_allocator. For now we do an explicit resize. - particle_to_inject.softReserve( new_particle_count ); - particle_to_inject.resize( new_particle_count ); // We probably invalidated the iterators + particle_to_inject.softReserve( new_count ); + particle_to_inject.resize( new_count ); // We probably invalidated the iterators // Copy out of cluster/tile/chunk particles // partition_copy is way slower than copy_if/remove_copy_if on rocthrust // https://github.com/ROCmSoftwarePlatform/rocThrust/issues/247 - const auto first_particle_to_inject = particle_iterator_provider( particle_to_inject ); + const auto first_to_inject = particle_iterator_provider( particle_to_inject ); + const auto first_to_reorder = first_to_inject + inject_count; // NOTE: copy_if/remove_copy_if are stable. - const auto partitioned_particles_bounds_true = thrust::copy_if( thrust::device, + // First, copy particles that are not in their own cluster anymore + const auto first_already_ordered = thrust::copy_if( thrust::device, first_particle, last_particle, - // Dont overwrite the particle_to_inject (at the start of the array) - first_particle_to_inject + new_particle_to_inject_count, + first_to_reorder, OutOfClusterPredicate{ cluster_type } ); - const auto partitioned_particles_bounds_false = thrust::remove_copy_if( thrust::device, + // Then, copy particles that are still in their own cluster + const auto end = thrust::remove_copy_if( thrust::device, first_particle, last_particle, - // Do the copy with a destination - // starting from partitioned_particles_bounds_true - partitioned_particles_bounds_true, + first_already_ordered, OutOfClusterPredicate{ cluster_type } ); // Compute or recompute the cluster index of the particle_to_inject @@ -502,23 +512,23 @@ namespace detail { // - we can "save" some work here if cluster index is already computed // for the new particles to inject (not the one we got with copy_if). 
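        // Side note on the copy_if/remove_copy_if pair above: together they act
        // as a stable partition done in two passes (rocThrust's partition_copy
        // being slower, per the NOTE). A self-contained sketch of the same idea
        // on a bare key array; the predicate and buffers are illustrative and
        // not Smilei data structures:
        //
        //     // requires <thrust/copy.h>, <thrust/remove.h>,
        //     //          <thrust/device_vector.h>, <thrust/execution_policy.h>
        //     struct IsNegative {
        //         __host__ __device__ bool operator()( int k ) const { return k < 0; }
        //     };
        //
        //     // 'out' must hold at least in.size() elements; matching keys land
        //     // first, in order, followed by the non-matching keys, also in order.
        //     void stablePartitionCopy( const thrust::device_vector<int> &in,
        //                               thrust::device_vector<int> &out )
        //     {
        //         auto mid = thrust::copy_if( thrust::device, in.begin(), in.end(),
        //                                     out.begin(), IsNegative{} );
        //         thrust::remove_copy_if( thrust::device, in.begin(), in.end(),
        //                                 mid, IsNegative{} );
        //     }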
// - doComputeParticleClusterKey( first_particle_to_inject, - partitioned_particles_bounds_true, + doComputeParticleClusterKey( first_to_inject, + first_already_ordered, cluster_type ); - const auto first_particle_to_inject_no_key = particle_no_key_iterator_provider( particle_to_inject ); - const auto particle_to_rekey_count = std::distance( first_particle_to_inject, - partitioned_particles_bounds_true ); + const auto first_to_inject_no_key = particle_no_key_iterator_provider( particle_to_inject ); + const auto particle_to_rekey_count = std::distance( first_to_inject, + first_already_ordered ); doSortParticleByKey( particle_to_inject.getPtrCellKeys(), particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, - first_particle_to_inject_no_key ); + first_to_inject_no_key ); // This free generates a lot of memory fragmentation. // particle_container.free(); // Same as for particle_to_inject, non-initializing vector is best. - particle_container.softReserve( new_particle_count ); - particle_container.resize( new_particle_count ); + particle_container.softReserve( new_count ); + particle_container.resize( new_count ); // Merge by key // NOTE: Dont merge in place on GPU. That means we need an other large buffer! @@ -527,9 +537,9 @@ namespace detail { particle_to_inject.getPtrCellKeys(), // Input range 1, first key particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, // Input range 1, last key particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, // Input range 2, first key - particle_to_inject.getPtrCellKeys() + new_particle_count, // Input range 2, last key - first_particle_to_inject_no_key, // Input range 1, first value - first_particle_to_inject_no_key + particle_to_rekey_count, // Input range 2, first value + particle_to_inject.getPtrCellKeys() + new_count, // Input range 2, last key + first_to_inject_no_key, // Input range 1, first value + first_to_inject_no_key + particle_to_rekey_count, // Input range 2, first value particle_container.getPtrCellKeys(), // Output range first key particle_no_key_iterator_provider( particle_container ) ); // Output range first value @@ -1365,38 +1375,61 @@ unsigned int nvidiaParticles::deviceCapacity() const } // ----------------------------------------------------------------------------- -//! Extract particles from the Particles object and put -//! them in the Particles object `particles_to_move` +//! Move escaping particles to the buffers // ----------------------------------------------------------------------------- -void nvidiaParticles::extractParticles( Particles* particles_to_move ) +void nvidiaParticles::extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] ) +{ + // Escaping particles have a cell_key equal to -2-direction + // where direction goes from 0 to 6 and tells which way the particle escapes. + // If the cell_key is -1, the particle must be destroyed so it is not extracted. + + extractParticlesByKey<-2>( copy[0], buffer[0] ); // x_min + extractParticlesByKey<-3>( copy[1], buffer[1] ); // x_max + if( ndim > 1 ) { + extractParticlesByKey<-4>( copy[2], buffer[2] ); // y_min + extractParticlesByKey<-5>( copy[3], buffer[3] ); // y_max + if( ndim > 2 ) { + extractParticlesByKey<-6>( copy[4], buffer[4] ); // z_min + extractParticlesByKey<-7>( copy[5], buffer[5] ); // z_max + } + } +} + + +//! Copy particles which have cell_key = key +template< const int key> +void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) { // TODO(Etienne M): We are doing extra work. 
We could use something like - // std::partition to output the invalidated particles in particles_to_move + // std::partition to output the invalidated particles in buffer // and keep the good ones. This would help us avoid the std::remove_if in // the particle injection and sorting algorithm. - - // Manage the send data structure - nvidiaParticles* const cp_parts = static_cast( particles_to_move ); - const int nparts = gpu_nparts_; - const int position_dimension_count = nvidia_position_.size(); - - const int nparts_to_move = thrust::count_if( thrust::device, - nvidia_cell_keys_.cbegin(), - nvidia_cell_keys_.cbegin() + nparts, - count_if_out() ); - - // Resize it, if too small (copy_if do not resize) - cp_parts->resize( nparts_to_move ); - + + if( ! copy ) { + return; + } + + const int nparts = gpu_nparts_; // Iterator of the main data structure // NOTE: https://nvidia.github.io/thrust/api/classes/classthrust_1_1zip__iterator.html#class-thrustzip_iterator - const auto source_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(), + const auto source_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(), nvidia_momentum_[0].begin(), nvidia_momentum_[1].begin(), nvidia_momentum_[2].begin(), nvidia_weight_.begin(), nvidia_charge_.begin() ) ); - const auto source_iterator_last = source_iterator_first + nparts; // std::advance + const auto source_iterator_last = source_iterator_first + nparts; // std::advance + + nvidiaParticles* const cp_parts = static_cast( buffer ); + + const int nparts_to_copy = thrust::count_if( thrust::device, + nvidia_cell_keys_.cbegin(), + nvidia_cell_keys_.cbegin() + nparts, + cellKeyEquals() ); + + // Resize it, if too small (copy_if do not resize) + cp_parts->resize( nparts_to_copy ); + const auto destination_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( cp_parts->nvidia_position_[0].begin(), cp_parts->nvidia_momentum_[0].begin(), cp_parts->nvidia_momentum_[1].begin(), @@ -1404,24 +1437,23 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move ) cp_parts->nvidia_weight_.begin(), cp_parts->nvidia_charge_.begin() ) ); - // Copy send particles in dedicated data structure if nvidia_cell_keys_=0 (currently = 1 if keeped, new PartBoundCond::apply(...)) + // Copy send particles in dedicated data structure thrust::copy_if( thrust::device, source_iterator_first, source_iterator_last, - // Copy depending on count_if_out()(nvidia_cell_keys_[i]) nvidia_cell_keys_.cbegin(), destination_iterator_first, - count_if_out() ); + cellKeyEquals() ); - // Copy the other position values depending on the simulation's grid - // dimensions - for( int i = 1; i < position_dimension_count; ++i ) { + // Copy the other position values depending on the simulation's grid dimensions + const int ndim_particles = nvidia_position_.size(); + for( int i = 1; i < ndim_particles; ++i ) { thrust::copy_if( thrust::device, nvidia_position_[i].cbegin(), nvidia_position_[i].cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_position_[i].begin(), - count_if_out() ); + cellKeyEquals() ); } // Special treatment for chi if radiation emission @@ -1431,7 +1463,7 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move ) nvidia_chi_.cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_chi_.begin(), - count_if_out() ); + cellKeyEquals() ); } if( has_Monte_Carlo_process ) { @@ -1440,7 +1472,7 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move ) 
nvidia_tau_.cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_tau_.begin(), - count_if_out() ); + cellKeyEquals() ); } if( tracked ) { @@ -1449,10 +1481,10 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move ) nvidia_id_.cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_id_.begin(), - count_if_out() ); + cellKeyEquals() ); } - particles_to_move->copyFromDeviceToHost(); + buffer->copyFromDeviceToHost(); } @@ -1475,7 +1507,7 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move ) // std::begin( nvidia_position_[i] ), // std::begin( nvidia_position_[i] ) + nparts, // std::cbegin( nvidia_cell_keys_ ), -// count_if_out() ); +// cellKeyEquals<-1>() ); // } // //} @@ -1490,7 +1522,7 @@ int nvidiaParticles::eraseLeavingParticles() const int nparts_to_remove = thrust::count_if( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.begin() + nparts, - count_if_out() ); + cellKeyNegative() ); if( nparts_to_remove > 0 ) { @@ -1508,7 +1540,7 @@ int nvidiaParticles::eraseLeavingParticles() first_particle, last_particle, nvidia_cell_keys_.cbegin(), - count_if_out() ); + cellKeyNegative() ); // Remove the other position values depending on the simulation's grid // dimensions @@ -1517,7 +1549,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_position_[i].begin(), nvidia_position_[i].begin() + nparts, nvidia_cell_keys_.cbegin(), - count_if_out() ); + cellKeyNegative() ); } if( has_quantum_parameter ) { @@ -1525,7 +1557,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_chi_.begin(), nvidia_chi_.begin() + nparts, nvidia_cell_keys_.cbegin(), - count_if_out() ); + cellKeyNegative() ); } if( has_Monte_Carlo_process ) { @@ -1533,7 +1565,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_tau_.begin(), nvidia_tau_.begin() + nparts, nvidia_cell_keys_.cbegin(), - count_if_out() ); + cellKeyNegative() ); } if( tracked ) { @@ -1541,7 +1573,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_id_.begin(), nvidia_id_.begin() + nparts, nvidia_cell_keys_.cbegin(), - count_if_out() ); + cellKeyNegative() ); } // Update current number of particles @@ -1679,8 +1711,7 @@ void nvidiaParticles::importAndSortParticles( Particles* particles_to_inject ) int nvidiaParticles::prepareBinIndex() { if( first_index.size() == 0 ) { - // Some Particles object like particles_to_move do not have allocated - // bins, we skip theses. + // Some Particles object do not have allocated bins, we skip theses. return -1; } diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index 249a9fcf2..64164fad7 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -113,10 +113,12 @@ class nvidiaParticles : public Particles }; // ----------------------------------------------------------------------------- - //! Extract particles from the Particles object and put - //! them in the Particles object `particles_to_move` + //! Move escaping particles to the buffers // ----------------------------------------------------------------------------- - void extractParticles( Particles* particles_to_move ) override; + void extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] ) override; + + template< const int key> + void extractParticlesByKey( bool copy, Particles* buffer ); // ----------------------------------------------------------------------------- //! 
Erase particles leaving the patch object on device and returns the number of particle removed @@ -124,7 +126,7 @@ class nvidiaParticles : public Particles int eraseLeavingParticles() override; // ----------------------------------------------------------------------------- - //! Inject particles from particles_to_move into *this and return he number of particle added + //! Inject particles from particles_to_inject into *this and return the number of particle added // ----------------------------------------------------------------------------- int injectParticles( Particles* particles_to_inject ) override; diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp index 546e0ca08..d61c1f9e1 100755 --- a/src/Patch/Patch.cpp +++ b/src/Patch/Patch.cpp @@ -784,10 +784,10 @@ void Patch::cornersParticles( int ispec, Params ¶ms, int iDim ) } - // Copy corner particles to the start or the end of the particles to be sent for the following dimension + // Copy corner particles to the end of the particles to be sent for the following dimension for( size_t otherDim = iDim+1; otherDim < (size_t) ndim; otherDim++ ) { if( indices_corner_min[otherDim-iDim-1].size() > 0 && neighbor_[otherDim][0] != MPI_PROC_NULL ) { - partRecv.copyParticles( indices_corner_min[otherDim-iDim-1], *buffer.partSend[otherDim][0], 0 ); + partRecv.copyParticles( indices_corner_min[otherDim-iDim-1], *buffer.partSend[otherDim][0], buffer.partSend[otherDim][0]->size() ); } if( indices_corner_max[otherDim-iDim-1].size() > 0 && neighbor_[otherDim][1] != MPI_PROC_NULL ) { partRecv.copyParticles( indices_corner_max[otherDim-iDim-1], *buffer.partSend[otherDim][1], buffer.partSend[otherDim][1]->size() ); @@ -1310,7 +1310,6 @@ void Patch::deleteFieldsOnDevice() // for( unsigned int ispec=0 ; ispec<( *this )( ipatch )->vecSpecies.size() ; ispec++ ) { // Species *spec = species( ipatch, ispec ); // spec->particles->initializeDataOnDevice(); -// spec->particles_to_move->initializeDataOnDevice(); // //#pragma acc enter data copyin(spec->nrj_radiation) // } diff --git a/src/Patch/VectorPatch.cpp b/src/Patch/VectorPatch.cpp index 22d976ba2..9067d049d 100755 --- a/src/Patch/VectorPatch.cpp +++ b/src/Patch/VectorPatch.cpp @@ -4671,86 +4671,22 @@ void VectorPatch::allocateDataOnDevice(Params ¶ms, RadiationTables *radiation_tables, MultiphotonBreitWheelerTables *multiphoton_Breit_Wheeler_tables) { - + #if defined( SMILEI_ACCELERATOR_MODE ) // TODO(Etienne M): FREE. If we have load balancing or other patch // creation/destruction available (which is not the case on GPU ATM), // we should be taking care of freeing this GPU memory. 
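    // Side note on the Patch::cornersParticles hunk above: passing dest_id equal
    // to partSend->size() appends the corner particles to the send buffer, where
    // the old call with dest_id = 0 inserted them at the front and shifted every
    // particle already stored there. The cost difference, sketched on a plain
    // std::vector (purely illustrative, not Smilei code):
    //
    //     #include <vector>
    //
    //     // Old behaviour on the min side: insert at the front,
    //     // O(dst.size()) element moves on every call.
    //     void prependAll( std::vector<double> &dst, const std::vector<double> &src )
    //     {
    //         dst.insert( dst.begin(), src.begin(), src.end() );
    //     }
    //
    //     // New behaviour: append, amortized O(src.size()) per call.
    //     void appendAll( std::vector<double> &dst, const std::vector<double> &src )
    //     {
    //         dst.insert( dst.end(), src.begin(), src.end() );
    //     }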
- const int npatches = this->size(); - - // const int sizeofJx = patches_[0]->EMfields->Jx_->size(); - // const int sizeofJy = patches_[0]->EMfields->Jy_->size(); - // const int sizeofJz = patches_[0]->EMfields->Jz_->size(); - // const int sizeofRho = patches_[0]->EMfields->rho_->size(); - - // const int sizeofEx = patches_[0]->EMfields->Ex_->size(); - // const int sizeofEy = patches_[0]->EMfields->Ey_->size(); - // const int sizeofEz = patches_[0]->EMfields->Ez_->size(); - - // const int sizeofBx = patches_[0]->EMfields->Bx_->size(); - // const int sizeofBy = patches_[0]->EMfields->By_->size(); - // const int sizeofBz = patches_[0]->EMfields->Bz_->size(); - - for( int ipatch=0 ; ipatchvecSpecies.size(); ispec++ ) { - Species *spec = species( ipatch, ispec ); - spec->particles->initializeDataOnDevice(); - spec->particles_to_move->initializeDataOnDevice(); - - // Create photon species on the device - if ( spec->radiation_model_ == "mc" && spec->photon_species_) { - spec->radiated_photons_->initializeDataOnDevice(); - } - - // Create pair species on the device - if ( spec->mBW_pair_species_[0] && spec->mBW_pair_species_[1]) { - spec->mBW_pair_particles_[0]->initializeDataOnDevice(); - spec->mBW_pair_particles_[1]->initializeDataOnDevice(); - } - - //#pragma acc enter data copyin(spec->nrj_radiation) + for( auto spec: patch->vecSpecies ) { + spec->allocateParticlesOnDevice(); } // Allocate field data structures on GPU - patches_[ipatch]->allocateFieldsOnDevice(); - - // const double *const Jx = patches_[ipatch]->EMfields->Jx_->data(); - // const double *const Jy = patches_[ipatch]->EMfields->Jy_->data(); - // const double *const Jz = patches_[ipatch]->EMfields->Jz_->data(); - // const double *const Rho = patches_[ipatch]->EMfields->rho_->data(); + patch->allocateFieldsOnDevice(); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Jx, sizeofJx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Jy, sizeofJy ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Jz, sizeofJz ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Rho, sizeofRho ); - - // const double *const Ex = patches_[ipatch]->EMfields->Ex_->data(); - // const double *const Ey = patches_[ipatch]->EMfields->Ey_->data(); - // const double *const Ez = patches_[ipatch]->EMfields->Ez_->data(); - - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Ex, sizeofEx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Ey, sizeofEy ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Ez, sizeofEz ); - - // const double *const Bmx = patches_[ipatch]->EMfields->Bx_m->data(); - // const double *const Bmy = patches_[ipatch]->EMfields->By_m->data(); - // const double *const Bmz = patches_[ipatch]->EMfields->Bz_m->data(); - - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Bmx, sizeofBx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Bmy, sizeofBy ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Bmz, sizeofBz ); - - // const double *const Bx = patches_[ipatch]->EMfields->Bx_->data(); - // const double *const By = patches_[ipatch]->EMfields->By_->data(); - // const double *const Bz = patches_[ipatch]->EMfields->Bz_->data(); - - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( Bx, sizeofBx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( By, sizeofBy ); - // 
smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( Bz, sizeofBz ); - } // end patch loop // TODO(Etienne M): We should create a function that does the copy of the radiation table. diff --git a/src/SmileiMPI/AsyncMPIbuffers.cpp b/src/SmileiMPI/AsyncMPIbuffers.cpp index a5a53dbb0..4cb283d17 100755 --- a/src/SmileiMPI/AsyncMPIbuffers.cpp +++ b/src/SmileiMPI/AsyncMPIbuffers.cpp @@ -1,5 +1,6 @@ #include "AsyncMPIbuffers.h" +#include "ParticlesFactory.h" #include "Field.h" #include "Patch.h" @@ -75,29 +76,29 @@ SpeciesMPIbuffers::~SpeciesMPIbuffers() } -void SpeciesMPIbuffers::allocate( unsigned int ndims ) +void SpeciesMPIbuffers::allocate( Params ¶ms, Patch *patch ) { - srequest.resize( ndims ); - rrequest.resize( ndims ); + srequest.resize( params.nDim_field ); + rrequest.resize( params.nDim_field ); - partRecv.resize( ndims ); - partSend.resize( ndims ); + partRecv.resize( params.nDim_field ); + partSend.resize( params.nDim_field ); - partSendSize.resize( ndims ); - partRecvSize.resize( ndims ); + partSendSize.resize( params.nDim_field ); + partRecvSize.resize( params.nDim_field ); - for( unsigned int i=0 ; i > partRecv; diff --git a/src/SmileiMPI/SmileiMPI.cpp b/src/SmileiMPI/SmileiMPI.cpp index c35a69fe9..4fe93fd03 100755 --- a/src/SmileiMPI/SmileiMPI.cpp +++ b/src/SmileiMPI/SmileiMPI.cpp @@ -929,8 +929,7 @@ void SmileiMPI::recv_species( Patch *patch, int from, int &tag, Params ¶ms ) recv( patch->vecSpecies[ispec]->particles, from, tag+2*ispec, recvParts ); MPI_Type_free( &( recvParts ) ); } - patch->vecSpecies[ispec]->particles->initializeDataOnDevice(); - patch->vecSpecies[ispec]->particles_to_move->initializeDataOnDevice(); + patch->vecSpecies[ispec]->allocateParticlesOnDevice(); } diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp index 0fb38f673..bfc1ae036 100755 --- a/src/Species/Species.cpp +++ b/src/Species/Species.cpp @@ -90,7 +90,6 @@ Species::Species( Params ¶ms, Patch *patch ) : { // &particles_sorted[0] particles = ParticlesFactory::create( params, *patch ); - particles_to_move = ParticlesFactory::create( params, *patch ); regular_number_array_.clear(); partBoundCond = NULL; @@ -104,7 +103,7 @@ Species::Species( Params ¶ms, Patch *patch ) : dx_inv_[1] = 1./cell_length[1]; dx_inv_[2] = 1./cell_length[2]; - initCluster( params ); + initCluster( params, patch ); inv_nDim_particles = 1./( ( double )nDim_particle ); length_[0]=0; @@ -123,7 +122,7 @@ Species::Species( Params ¶ms, Patch *patch ) : }//END Species creator -void Species::initCluster( Params ¶ms ) +void Species::initCluster( Params ¶ms, Patch *patch ) { // NOTE: On GPU we dont use first_index, it would contain redundant data but // we are forced to initialize it due to ParticleCreator::create() and the @@ -252,7 +251,7 @@ void Species::initCluster( Params ¶ms ) #endif //Initialize specMPI - MPI_buffer_.allocate( nDim_field ); + MPI_buffer_.allocate( params, patch ); //ener_tot = 0.; nrj_bc_lost = 0.; @@ -386,7 +385,6 @@ void Species::initOperators( Params ¶ms, Patch *patch ) typePartRecv.resize( nDim_field*2, MPI_DATATYPE_NULL ); exchangePatch = MPI_DATATYPE_NULL; - particles_to_move->initialize( 0, *particles ); } @@ -396,7 +394,6 @@ void Species::initOperators( Params ¶ms, Patch *patch ) Species::~Species() { delete particles; - delete particles_to_move; delete Push; delete Interp; @@ -631,6 +628,34 @@ Species::deleteSpeciesCurrentAndChargeOnDevice( } } + +void Species::allocateParticlesOnDevice() +{ + particles->initializeDataOnDevice(); + for( auto partSends: 
MPI_buffer_.partSend ) { + for( auto partSend: partSends ) { + partSend->initializeDataOnDevice(); + } + } + for( auto partRecvs: MPI_buffer_.partRecv ) { + for( auto partRecv: partRecvs ) { + partRecv->initializeDataOnDevice(); + } + } + + // Create photon species on the device + if( radiation_model_ == "mc" && photon_species_ ) { + radiated_photons_->initializeDataOnDevice(); + } + + // Create pair species on the device + if( mBW_pair_species_[0] && mBW_pair_species_[1] ) { + mBW_pair_particles_[0]->initializeDataOnDevice(); + mBW_pair_particles_[1]->initializeDataOnDevice(); + } +} + + //! Copy particles from host to device void Species::copyParticlesFromHostToDevice() @@ -1754,33 +1779,22 @@ void Species::sortParticles( Params ¶ms ) // ----------------------------- // GPU version - - // particles_to_move contains, up to here, send particles - // clean it to manage recv particles - particles_to_move->clear(); // Clear on the host - // Merge all MPI_buffer_.partRecv in particles_to_move - for( int idim = 0; idim < params.nDim_field; idim++ ) { - for( int iNeighbor = 0; iNeighbor < 2; iNeighbor++ ) { - int n_part_recv = MPI_buffer_.partRecv[idim][iNeighbor]->size(); - if( n_part_recv != 0 ) { - // insert n_part_recv in particles_to_move from 0 - MPI_buffer_.partRecv[idim][iNeighbor]->copyParticles( 0, - n_part_recv, - *particles_to_move, - particles_to_move->size() ); + + // Merge all MPI_buffer_.partRecv in the first one + Particles * first_buffer = MPI_buffer_.partRecv[0][0]; + for( auto &partRecvs: MPI_buffer_.partRecv ) { + for( auto partRecv: partRecvs ) { + if( partRecv != first_buffer && partRecv->size() > 0 ) { + partRecv->copyParticles( 0, partRecv->size(), *first_buffer, first_buffer->size() ); + partRecv->clear(); } } } - - particles_to_move->copyFromHostToDevice(); - - // // Erase particles that leaves this patch - // particles->last_index[0] = particles->eraseLeavingParticles(); - // - // // Inject newly arrived particles in particles_to_move - // particles->last_index[0] += particles->injectParticles( particles_to_move ); - - particles->importAndSortParticles( particles_to_move ); + + first_buffer->copyFromHostToDevice(); + + particles->importAndSortParticles( first_buffer ); + #else // -------------------------- @@ -1791,24 +1805,6 @@ void Species::sortParticles( Params ¶ms ) int ndim = params.nDim_field; int idim; - // Compute total number of particles received - // int total_number_part_recv = 0; - //Merge all MPI_buffer_.partRecv in particles_to_move - // for( int idim = 0; idim < ndim; idim++ ) { - // for( int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) { - // int n_part_recv = MPI_buffer_.partRecv[idim][iNeighbor]->size(); - // if( ( n_part_recv!=0 ) ) { - // // insert n_part_recv in particles_to_move from 0 - // //MPI_buffer_.partRecv[idim][iNeighbor]->copyParticles( 0, n_part_recv, *particles_to_move, 0 ); - // total_number_part_recv += n_part_recv; - // //particles->last_index[particles->last_index.size()-1] += n_part_recv; - // //particles->cell_keys.resize(particles->cell_keys.size()+n_part_recv); - // } - // } - // } - //cout << "\t Species id : " << species_number_ << " - nparticles recv : " << blabla << endl; - - // Sort to adapt do cell_keys usage std::vector indexes_of_particles_to_exchange; for ( int ipart=0 ; ipart< (int)(getNbrOfParticles()) ; ipart++ ) { diff --git a/src/Species/Species.h b/src/Species/Species.h index b91c9521b..83a2bab9d 100755 --- a/src/Species/Species.h +++ b/src/Species/Species.h @@ -147,8 +147,6 @@ class Species //! 
Vector containing all Particles of the considered Species Particles *particles; - //! Data structure through which passes particles which move from one patch to another - Particles *particles_to_move; Particles particles_sorted[2]; //std::vector index_of_particles_to_exchange; @@ -344,7 +342,7 @@ class Species // ----------------------------------------------------------------------------- // 5. Methods - virtual void initCluster( Params & ); + virtual void initCluster( Params &, Patch * ); virtual void resizeCluster( Params & ); @@ -386,6 +384,8 @@ class Species #if defined( SMILEI_ACCELERATOR_MODE ) + void allocateParticlesOnDevice(); + //! Copy particles from host to device void copyParticlesFromHostToDevice(); diff --git a/src/Species/SpeciesV.cpp b/src/Species/SpeciesV.cpp index 89d12b340..4a4199b63 100755 --- a/src/Species/SpeciesV.cpp +++ b/src/Species/SpeciesV.cpp @@ -46,7 +46,7 @@ using namespace std; SpeciesV::SpeciesV( Params ¶ms, Patch *patch ) : Species( params, patch ) { - initCluster( params ); + initCluster( params, patch ); npack_ = 0 ; packsize_ = 0; @@ -106,7 +106,7 @@ SpeciesV::~SpeciesV() } -void SpeciesV::initCluster( Params ¶ms ) +void SpeciesV::initCluster( Params ¶ms, Patch *patch ) { int ncells = 1; for( unsigned int iDim=0 ; iDim Date: Thu, 4 Apr 2024 11:24:01 +0200 Subject: [PATCH 07/54] forgot to remove function --- src/Particles/Particles.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h index 13941b40a..a155baf7a 100755 --- a/src/Particles/Particles.h +++ b/src/Particles/Particles.h @@ -476,7 +476,7 @@ class Particles //! Extract particles escaping the box to buffers // ----------------------------------------------------------------------------- virtual void extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] ); -virtual void extractParticles( Particles* particles_to_move ); + // ----------------------------------------------------------------------------- //! 
Erase particles leaving the patch object on device
 // -----------------------------------------------------------------------------

From 3afa356406ed886340af9afec929152fbf277ac3 Mon Sep 17 00:00:00 2001
From: Frederic Perez
Date: Thu, 18 Apr 2024 14:24:02 +0200
Subject: [PATCH 08/54] scatter recvBuffers on CPU instead of GPU

---
 makefile                          |  2 +-
 src/Particles/Particles.cpp       | 39 ++++++++++++++-
 src/Particles/Particles.h         |  5 +-
 src/Particles/nvidiaParticles.cu  | 81 +++++++++++++++----------------
 src/Particles/nvidiaParticles.h   | 11 +++--
 src/Patch/Patch.cpp               |  3 +-
 src/Smilei.cpp                    |  2 +-
 src/SmileiMPI/AsyncMPIbuffers.cpp | 17 +++++--
 src/Species/Species.cpp           | 16 +++---
 9 files changed, 110 insertions(+), 66 deletions(-)

diff --git a/makefile b/makefile
index 3aaff0201..36239640d 100755
--- a/makefile
+++ b/makefile
@@ -216,7 +216,7 @@ endif
 ifneq (,$(call parse_config,gpu_amd))
     CXXFLAGS += -DSMILEI_ACCELERATOR_MODE
     GPU_COMPILER ?= $(CC)
-    GPU_COMPILER_FLAGS += -x hip -DSMILEI_ACCELERATOR_MODE -std=c++14 $(DIRS:%=-I%) #$(PY_FLAGS)
+    GPU_COMPILER_FLAGS += -x hip -DSMILEI_ACCELERATOR_MODE -std=c++14 $(DIRS:%=-I%)
     GPU_COMPILER_FLAGS += -I$(BUILD_DIR)/src/Python $(PY_CXXFLAGS)
     GPU_KERNEL_SRCS := $(shell find src/* -name \*.cu)
     GPU_KERNEL_OBJS := $(addprefix $(BUILD_DIR)/, $(GPU_KERNEL_SRCS:.cu=.o))
diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp
index b675ac12f..688c53085 100755
--- a/src/Particles/Particles.cpp
+++ b/src/Particles/Particles.cpp
@@ -1305,8 +1305,37 @@ void Particles::copyFromDeviceToHost()
 }
 
 // Loop all particles and copy the outgoing ones to buffers
-void Particles::extractParticles( const size_t /* ndim */, const bool copy[], Particles* buffer[] )
+void Particles::copyLeavingParticlesToBuffers( const bool copy[], Particles* buffer[] )
 {
+    // Leaving particles have a cell_key equal to -2-direction
+    // where direction goes from 0 to 5 and tells which way the particle escapes.
+    // If the cell_key is -1, the particle must be destroyed so it is not extracted.
+
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE )
+
+    // GPU
+
+    // Copy leaving particles to buffer[0] on the GPU
+    copyLeavingParticlesToBuffer( buffer[0] );
+
+    // Dispatch between the different buffers on the CPU
+    // (doing this on the GPU is slower; maybe replacing thrust operations with pure cuda would work)
+    vector<size_t> indices;
+    for( size_t ipart = 0; ipart < buffer[0]->size(); ipart++ ) {
+        int direction = -buffer[0]->cell_keys[ipart] - 2;
+        if( direction > 0 ) {
+            if( copy[direction] ) {
+                buffer[0]->copyParticle( ipart, *buffer[direction] );
+            }
+            indices.push_back( ipart );
+        }
+    }
+    buffer[0]->eraseParticles( indices );
+
+#else
+
+    // CPU
+
     for( size_t ipart = 0; ipart < size(); ipart++ ) {
         if( cell_keys[ipart] < -1 ) {
             int direction = -cell_keys[ipart] - 2;
@@ -1315,8 +1344,16 @@ void Particles::extractParticles( const size_t /* ndim */, const bool copy[], Pa
             }
         }
     }
+
+#endif
 }
 
+void Particles::copyLeavingParticlesToBuffer( Particles* )
+{
+    ERROR( "Device only feature, should not have come here!" );
+}
+
+
 void Particles::savePositions()
 {
     unsigned int ndim = Position.size(), npart = size();
     double *p[3], *pold[3];
diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h
index a155baf7a..86f9f9cac 100755
--- a/src/Particles/Particles.h
+++ b/src/Particles/Particles.h
@@ -473,9 +473,10 @@ class Particles
     // Accelerator specific virtual functions
 
     // -----------------------------------------------------------------------------
-    //! Extract particles escaping the box to buffers
+    //! Extract particles leaving the box to buffers
     // -----------------------------------------------------------------------------
-    virtual void extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] );
+    void copyLeavingParticlesToBuffers( const bool copy[], Particles* buffer[] );
+    virtual void copyLeavingParticlesToBuffer( Particles* buffer );
 
     // -----------------------------------------------------------------------------
     //! Erase particles leaving the patch object on device
diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu
index 191e0943f..efca22ad5 100644
--- a/src/Particles/nvidiaParticles.cu
+++ b/src/Particles/nvidiaParticles.cu
@@ -54,6 +54,15 @@ struct cellKeyNegative
     }
 };
 
+struct cellKeyBelowMinus1
+{
+    constexpr __host__ __device__ bool
+    operator()( const int& x ) const
+    {
+        return x < -1;
+    }
+};
+
 namespace detail {
 
////////////////////////////////////////////////////////////////////////////////
@@ -1375,49 +1384,33 @@ unsigned int nvidiaParticles::deviceCapacity() const
 }
 
 // -----------------------------------------------------------------------------
-//! Move escaping particles to the buffers
+//! Move leaving particles to the buffer
 // -----------------------------------------------------------------------------
-void nvidiaParticles::extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] )
+void nvidiaParticles::copyLeavingParticlesToBuffer( Particles* buffer )
 {
-    // Escaping particles have a cell_key equal to -2-direction
-    // where direction goes from 0 to 6 and tells which way the particle escapes.
-    // If the cell_key is -1, the particle must be destroyed so it is not extracted.
-
-    extractParticlesByKey<-2>( copy[0], buffer[0] ); // x_min
-    extractParticlesByKey<-3>( copy[1], buffer[1] ); // x_max
-    if( ndim > 1 ) {
-        extractParticlesByKey<-4>( copy[2], buffer[2] ); // y_min
-        extractParticlesByKey<-5>( copy[3], buffer[3] ); // y_max
-        if( ndim > 2 ) {
-            extractParticlesByKey<-6>( copy[4], buffer[4] ); // z_min
-            extractParticlesByKey<-7>( copy[5], buffer[5] ); // z_max
-        }
-    }
+    copyParticlesByPredicate( buffer, cellKeyBelowMinus1() );
+    buffer->copyFromDeviceToHost();
 }
 
 
-//! Copy particles which have cell_key = key
-template< const int key>
-void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer )
+//! Copy particles which satisfy some predicate
+template <typename Predicate>
+void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pred )
 {
     // TODO(Etienne M): We are doing extra work. We could use something like
-    // std::partition to output the invalidated particles in particles_to_move
+    // std::partition to output the invalidated particles in buffer
     // and keep the good ones. This would help us avoid the std::remove_if in
     // the particle injection and sorting algorithm.
 
-
-    if( !
copy ) { - return; - } - const int nparts = gpu_nparts_; // Iterator of the main data structure // NOTE: https://nvidia.github.io/thrust/api/classes/classthrust_1_1zip__iterator.html#class-thrustzip_iterator const auto source_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(), - nvidia_momentum_[0].begin(), - nvidia_momentum_[1].begin(), - nvidia_momentum_[2].begin(), - nvidia_weight_.begin(), - nvidia_charge_.begin() ) ); + nvidia_momentum_[0].begin(), + nvidia_momentum_[1].begin(), + nvidia_momentum_[2].begin(), + nvidia_weight_.begin(), + nvidia_charge_.begin() ) ); const auto source_iterator_last = source_iterator_first + nparts; // std::advance nvidiaParticles* const cp_parts = static_cast( buffer ); @@ -1425,7 +1418,7 @@ void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) const int nparts_to_copy = thrust::count_if( thrust::device, nvidia_cell_keys_.cbegin(), nvidia_cell_keys_.cbegin() + nparts, - cellKeyEquals() ); + pred ); // Resize it, if too small (copy_if do not resize) cp_parts->resize( nparts_to_copy ); @@ -1443,7 +1436,7 @@ void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) source_iterator_last, nvidia_cell_keys_.cbegin(), destination_iterator_first, - cellKeyEquals() ); + pred ); // Copy the other position values depending on the simulation's grid dimensions const int ndim_particles = nvidia_position_.size(); @@ -1453,7 +1446,7 @@ void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) nvidia_position_[i].cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_position_[i].begin(), - cellKeyEquals() ); + pred ); } // Special treatment for chi if radiation emission @@ -1463,7 +1456,7 @@ void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) nvidia_chi_.cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_chi_.begin(), - cellKeyEquals() ); + pred ); } if( has_Monte_Carlo_process ) { @@ -1472,7 +1465,7 @@ void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) nvidia_tau_.cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_tau_.begin(), - cellKeyEquals() ); + pred ); } if( tracked ) { @@ -1481,10 +1474,9 @@ void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) nvidia_id_.cbegin() + nparts, nvidia_cell_keys_.cbegin(), cp_parts->nvidia_id_.begin(), - cellKeyEquals() ); + pred ); } - buffer->copyFromDeviceToHost(); } @@ -1516,14 +1508,19 @@ void nvidiaParticles::extractParticlesByKey( bool copy, Particles* buffer ) //! 
Erase particles leaving the patch object on device // ----------------------------------------------------------------------------- int nvidiaParticles::eraseLeavingParticles() +{ + return eraseParticlesByPredicate( cellKeyNegative() ); +} + +template +int nvidiaParticles::eraseParticlesByPredicate( Predicate pred ) { const int position_dimension_count = nvidia_position_.size(); const int nparts = gpu_nparts_; const int nparts_to_remove = thrust::count_if( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.begin() + nparts, - cellKeyNegative() ); - + pred ); if( nparts_to_remove > 0 ) { const auto first_particle = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(), @@ -1540,7 +1537,7 @@ int nvidiaParticles::eraseLeavingParticles() first_particle, last_particle, nvidia_cell_keys_.cbegin(), - cellKeyNegative() ); + pred ); // Remove the other position values depending on the simulation's grid // dimensions @@ -1549,7 +1546,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_position_[i].begin(), nvidia_position_[i].begin() + nparts, nvidia_cell_keys_.cbegin(), - cellKeyNegative() ); + pred ); } if( has_quantum_parameter ) { @@ -1557,7 +1554,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_chi_.begin(), nvidia_chi_.begin() + nparts, nvidia_cell_keys_.cbegin(), - cellKeyNegative() ); + pred ); } if( has_Monte_Carlo_process ) { @@ -1565,7 +1562,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_tau_.begin(), nvidia_tau_.begin() + nparts, nvidia_cell_keys_.cbegin(), - cellKeyNegative() ); + pred ); } if( tracked ) { @@ -1573,7 +1570,7 @@ int nvidiaParticles::eraseLeavingParticles() nvidia_id_.begin(), nvidia_id_.begin() + nparts, nvidia_cell_keys_.cbegin(), - cellKeyNegative() ); + pred ); } // Update current number of particles diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index 64164fad7..ba689f1e8 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -113,18 +113,21 @@ class nvidiaParticles : public Particles }; // ----------------------------------------------------------------------------- - //! Move escaping particles to the buffers + //! Move leaving particles to the buffers // ----------------------------------------------------------------------------- - void extractParticles( const size_t ndim, const bool copy[], Particles* buffer[] ) override; + void copyLeavingParticlesToBuffer( Particles* buffer ) override; - template< const int key> - void extractParticlesByKey( bool copy, Particles* buffer ); + template + void copyParticlesByPredicate( Particles* buffer, Predicate pred ); // ----------------------------------------------------------------------------- //! Erase particles leaving the patch object on device and returns the number of particle removed // ----------------------------------------------------------------------------- int eraseLeavingParticles() override; + template + int eraseParticlesByPredicate( Predicate pred ); + // ----------------------------------------------------------------------------- //! 
Inject particles from particles_to_inject into *this and return the number of particle added // ----------------------------------------------------------------------------- diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp index d61c1f9e1..585f76f97 100755 --- a/src/Patch/Patch.cpp +++ b/src/Patch/Patch.cpp @@ -539,6 +539,7 @@ void Patch::copyExchParticlesToBuffers( int ispec, Params ¶ms ) cleanMPIBuffers( ispec, params ); + // Make a list of buffers bool copy[params.nDim_field*2]; Particles* sendBuffer[params.nDim_field*2]; for( size_t iDim = 0; iDim < params.nDim_field; iDim++ ) { @@ -552,7 +553,7 @@ void Patch::copyExchParticlesToBuffers( int ispec, Params ¶ms ) copy[1] = copy[1] && ( Pcoordinates[0]!=params.number_of_patches[0]-1 || vecSpecies[ispec]->boundary_conditions_[0][1]=="periodic" ); } - part.extractParticles( params.nDim_field, copy, sendBuffer ); + part.copyLeavingParticlesToBuffers( copy, sendBuffer ); } // copyExchParticlesToBuffers(... iDim) diff --git a/src/Smilei.cpp b/src/Smilei.cpp index 0ab0db1a2..eae1993d9 100755 --- a/src/Smilei.cpp +++ b/src/Smilei.cpp @@ -124,7 +124,7 @@ int main( int argc, char *argv[] ) // oblivious to the program (only one, the one by default). // This could be a missed but very advanced optimization for some // kernels/exchange. - ERROR( "Simlei needs only one accelerator (GPU). Look for HIP_VISIBLE_DEVICES or 'gpu-bind=closest' in your SLURM script or use a custom binding script." ); + ERROR( "Smilei needs only one accelerator (GPU). Look for HIP_VISIBLE_DEVICES or 'gpu-bind=closest' in your SLURM script or use a custom binding script." ); } else { // ::omp_set_default_device(0); } diff --git a/src/SmileiMPI/AsyncMPIbuffers.cpp b/src/SmileiMPI/AsyncMPIbuffers.cpp index 4cb283d17..ff8efb17f 100755 --- a/src/SmileiMPI/AsyncMPIbuffers.cpp +++ b/src/SmileiMPI/AsyncMPIbuffers.cpp @@ -93,12 +93,21 @@ void SpeciesMPIbuffers::allocate( Params ¶ms, Patch *patch ) partRecvSize[i].resize( 2 ); partSendSize[i].resize( 2 ); + // NOTE: send/recv buffers on xmin / xmax use a different constructor because + // they must be sent on GPU for exchanging particles partRecv[i].resize( 2 ); - partRecv[i][0] = ParticlesFactory::create( params, *patch );; - partRecv[i][1] = ParticlesFactory::create( params, *patch );; partSend[i].resize( 2 ); - partSend[i][0] = ParticlesFactory::create( params, *patch );; - partSend[i][1] = ParticlesFactory::create( params, *patch );; + if( i == 0 ) { + partRecv[i][0] = ParticlesFactory::create( params, *patch ); + partRecv[i][1] = ParticlesFactory::create( params, *patch ); + partSend[i][0] = ParticlesFactory::create( params, *patch ); + partSend[i][1] = ParticlesFactory::create( params, *patch ); + } else { + partRecv[i][0] = new Particles(); + partRecv[i][1] = new Particles(); + partSend[i][0] = new Particles(); + partSend[i][1] = new Particles(); + } } } diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp index bfc1ae036..65358f555 100755 --- a/src/Species/Species.cpp +++ b/src/Species/Species.cpp @@ -632,16 +632,12 @@ Species::deleteSpeciesCurrentAndChargeOnDevice( void Species::allocateParticlesOnDevice() { particles->initializeDataOnDevice(); - for( auto partSends: MPI_buffer_.partSend ) { - for( auto partSend: partSends ) { - partSend->initializeDataOnDevice(); - } - } - for( auto partRecvs: MPI_buffer_.partRecv ) { - for( auto partRecv: partRecvs ) { - partRecv->initializeDataOnDevice(); - } - } + + // The first send/recv buffers are also on device + MPI_buffer_.partSend[0][0]->initializeDataOnDevice(); + 
MPI_buffer_.partSend[0][1]->initializeDataOnDevice();
+    MPI_buffer_.partRecv[0][0]->initializeDataOnDevice();
+    MPI_buffer_.partRecv[0][1]->initializeDataOnDevice();
 
     // Create photon species on the device
     if( radiation_model_ == "mc" && photon_species_ ) {

From ccac4ba75a1c2af9a183508be5448972f201e9cb Mon Sep 17 00:00:00 2001
From: cprouveur
Date: Thu, 18 Apr 2024 17:02:26 +0200
Subject: [PATCH 09/54] Implementation of GPU acceleration for the 1D cartesian
 geometry

---
 src/Checkpoint/Checkpoint.cpp                 |    8 +-
 src/ElectroMagn/ElectroMagn1D.cpp             |   67 +-
 src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp      |   60 +-
 src/ElectroMagnBC/ElectroMagnBC1D_SM.h        |    6 +-
 src/ElectroMagnSolver/MA_Solver1D_norm.cpp    |   91 +-
 src/ElectroMagnSolver/MF_Solver1D_Yee.cpp     |   53 +-
 src/Interpolator/Interpolator1D.cpp           |    2 +-
 src/Interpolator/Interpolator1D.h             |    2 +-
 src/Interpolator/Interpolator1D2Order.cpp     |  448 +++++--
 src/Interpolator/Interpolator1D2Order.h       |  114 +-
 src/Interpolator/Interpolator1D2OrderV.cpp    |   10 +-
 src/Interpolator/Interpolator1D2OrderV.h      |    4 +-
 src/Interpolator/Interpolator1D3Order.h       |    8 +-
 src/Interpolator/Interpolator1D4Order.h       |   88 +-
 src/Interpolator/Interpolator1DWT2Order.cpp   |    6 +-
 src/Interpolator/Interpolator1DWT2Order.h     |    4 +-
 src/Interpolator/Interpolator1DWT2OrderV.cpp  |   10 +-
 src/Interpolator/Interpolator1DWT2OrderV.h    |    4 +-
 src/Interpolator/Interpolator1DWT4Order.h     |    4 +-
 src/Interpolator/InterpolatorFactory.h        |   10 +
 src/Params/Params.h                           |    2 +-
 src/Particles/nvidiaParticles.cu              |  308 ++++-
 src/Patch/SyncVectorPatch.cpp                 |    2 +-
 src/Projector/Projector1D.h                   |   10 +-
 src/Projector/Projector1D2Order.cpp           |   52 +-
 src/Projector/Projector1D2Order.h             |    5 +-
 src/Projector/Projector1D2OrderGPU.cpp        |  385 ++++++
 src/Projector/Projector1D2OrderGPU.h          |  127 ++
 .../Projector1D2OrderGPUKernelCUDAHIP.cu      | 1103 +++++++++++++++++
 .../Projector1D2OrderGPUKernelCUDAHIP.h       |   71 ++
 src/Projector/Projector1D4Order.cpp           |   20 +-
 src/Projector/Projector1D4Order.h             |    1 -
 src/Projector/Projector2D2OrderGPU.cpp        |   34 +-
 src/Projector/Projector2D2OrderGPU.h          |    2 +-
 src/Projector/Projector2D2OrderGPUKernel.cpp  |   12 +-
 .../Projector2D2OrderGPUKernelCUDAHIP.cu      |   64 +-
 .../Projector2D2OrderGPUKernelCUDAHIP.h       |   10 +-
 src/Projector/Projector3D2OrderGPU.cpp        |   20 +-
 src/Projector/Projector3D2OrderGPU.h          |    2 +-
 .../Projector3D2OrderGPUKernelCUDAHIP.cu      |   20 +-
 .../Projector3D2OrderGPUKernelCUDAHIP.h       |   14 +-
 src/SmileiMPI/SmileiMPI.cpp                   |    8 +-
 42 files changed, 2805 insertions(+), 466 deletions(-)
 mode change 100644 => 100755 src/Interpolator/Interpolator1D2OrderV.cpp
 mode change 100644 => 100755 src/Interpolator/Interpolator1D2OrderV.h
 mode change 100644 => 100755 src/Particles/nvidiaParticles.cu
 create mode 100755 src/Projector/Projector1D2OrderGPU.cpp
 create mode 100755 src/Projector/Projector1D2OrderGPU.h
 create mode 100755 src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu
 create mode 100755 src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h
 mode change 100644 => 100755 src/Projector/Projector2D2OrderGPUKernel.cpp
 mode change 100644 => 100755 src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu
 mode change 100644 => 100755 src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h
 mode change 100644 => 100755 src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu
 mode change 100644 => 100755 src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h

diff --git a/src/Checkpoint/Checkpoint.cpp b/src/Checkpoint/Checkpoint.cpp
index 13c3d28a5..97d43c258 100755
--- a/src/Checkpoint/Checkpoint.cpp
+++ b/src/Checkpoint/Checkpoint.cpp
@@ -478,8 +478,8 @@ void Checkpoint::dumpPatch(
Patch *patch, Params ¶ms, H5Write &g ) name << setfill( '0' ) << setw( 2 ) << bcId; string groupName=Tools::merge( "EM_boundary-species-", name.str() ); H5Write b = g.group( groupName ); - b.attr( "By_val", embc->By_val ); - b.attr( "Bz_val", embc->Bz_val ); + b.attr( "By_val", embc->By_val_ ); + b.attr( "Bz_val", embc->Bz_val_ ); } else if( dynamic_cast( EMfields->emBoundCond[bcId] ) ) { ElectroMagnBC2D_SM *embc = static_cast( EMfields->emBoundCond[bcId] ); ostringstream name( "" ); @@ -889,8 +889,8 @@ void Checkpoint::restartPatch( Patch *patch, Params ¶ms, H5Read &g ) name << setfill( '0' ) << setw( 2 ) << bcId; string groupName = Tools::merge( "EM_boundary-species-", name.str() ); H5Read b = g.group( groupName ); - b.attr( "By_val", embc->By_val ); - b.attr( "Bz_val", embc->Bz_val ); + b.attr( "By_val", embc->By_val_ ); + b.attr( "Bz_val", embc->Bz_val_ ); } else if( dynamic_cast( EMfields->emBoundCond[bcId] ) ) { ElectroMagnBC2D_SM *embc = static_cast( EMfields->emBoundCond[bcId] ); ostringstream name( "" ); diff --git a/src/ElectroMagn/ElectroMagn1D.cpp b/src/ElectroMagn/ElectroMagn1D.cpp index ea97df8fb..d9ecbe478 100755 --- a/src/ElectroMagn/ElectroMagn1D.cpp +++ b/src/ElectroMagn/ElectroMagn1D.cpp @@ -559,34 +559,65 @@ void ElectroMagn1D::saveMagneticFields( bool is_spectral ) void ElectroMagn1D::centerMagneticFields() { // Static cast of the fields - Field1D *Bx1D = static_cast( Bx_ ); - Field1D *By1D = static_cast( By_ ); - Field1D *Bz1D = static_cast( Bz_ ); - Field1D *Bx1D_m = static_cast( Bx_m ); - Field1D *By1D_m = static_cast( By_m ); - Field1D *Bz1D_m = static_cast( Bz_m ); + const double *const __restrict__ Bx1D = Bx_->data(); + const double *const __restrict__ By1D = By_->data(); + const double *const __restrict__ Bz1D = Bz_->data(); + double *const __restrict__ Bx1D_m = Bx_m->data(); + double *const __restrict__ By1D_m = By_m->data(); + double *const __restrict__ Bz1D_m = Bz_m->data(); + const unsigned int nx_p = dimPrim[0]; + const unsigned int nx_d = dimDual[0]; + // for Bx^(p) - for( unsigned int i=0 ; isize(); + const int sizeofBy = By_->size(); + const int sizeofBz = Bz_->size(); + + #pragma acc parallel present(Bx1D[0:sizeofBx],Bx1D_m[0:sizeofBx]) + #pragma acc loop gang worker vector +#elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target + #pragma omp teams distribute parallel for //simd +#endif + for( unsigned int i=0 ; i( By_mBTIS3 ); - Field1D *Bz_oldBTIS3 = static_cast( Bz_mBTIS3 ); - - for( unsigned int i=0 ; idata(); + double *const Bz1D_oldBTIS3 = Bz_mBTIS3->data(); +#if defined( SMILEI_OPENACC_MODE ) + const int sizeofByBTIS3 = By_mBTIS3->size(); + const int sizeofBzBTIS3 = Bz_mBTIS3->size(); + #pragma acc parallel present(By1D_oldBTIS3[0:sizeofByBTIS3],By1D[0:sizeofBy],Bz1D_oldBTIS3[0:sizeofBzBTIS3],Bz1D[0:sizeofBz]) + #pragma acc loop gang vector +#elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target + #pragma omp teams distribute parallel for +#endif +#if !defined( SMILEI_ACCELERATOR_MODE ) + #pragma omp simd +#endif + for( unsigned int i=0 ; iisXmin() ) { if( field1D->name=="By" ) { - By_val = ( *my_field )( 0 ); + By_val_ = ( *my_field )( 0 ); } else if( field1D->name=="Bz" ) { - Bz_val = ( *my_field )( 0 ); + Bz_val_ = ( *my_field )( 0 ); } } else if( i_boundary_ == 1 && patch->isXmax() ) { if( field1D->name=="By" ) { - By_val = ( *my_field )( field1D->dims()[0]-1 ); + By_val_ = ( *my_field )( field1D->dims()[0]-1 ); } else if( field1D->name=="Bz" ) { - Bz_val = ( *my_field )( field1D->dims()[0]-1 ); + Bz_val_ = ( *my_field )( 
diff --git a/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp b/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp
--- a/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp
+++ b/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp
@@ ... @@ void ElectroMagnBC1D_SM::save_fields( Field *my_field, Patch *patch )
     Field1D *field1D = static_cast<Field1D *>( my_field );
     if( i_boundary_ == 0 && patch->isXmin() ) {
         if( field1D->name=="By" ) {
-            By_val = ( *my_field )( 0 );
+            By_val_ = ( *my_field )( 0 );
         } else if( field1D->name=="Bz" ) {
-            Bz_val = ( *my_field )( 0 );
+            Bz_val_ = ( *my_field )( 0 );
         }
     } else if( i_boundary_ == 1 && patch->isXmax() ) {
         if( field1D->name=="By" ) {
-            By_val = ( *my_field )( field1D->dims()[0]-1 );
+            By_val_ = ( *my_field )( field1D->dims()[0]-1 );
         } else if( field1D->name=="Bz" ) {
-            Bz_val = ( *my_field )( field1D->dims()[0]-1 );
+            Bz_val_ = ( *my_field )( field1D->dims()[0]-1 );
         }
     }
 }
@@ -74,11 +74,17 @@ void ElectroMagnBC1D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch *patch )
     if( patch->isBoundary( i_boundary_ ) ) {

         //Field1D* Ex1D = static_cast<Field1D *>(EMfields->Ex_);
-        Field1D *Ey1D = static_cast<Field1D *>( EMfields->Ey_ );
+        /*Field1D *Ey1D = static_cast<Field1D *>( EMfields->Ey_ );
         Field1D *Ez1D = static_cast<Field1D *>( EMfields->Ez_ );
         Field1D *By1D = static_cast<Field1D *>( EMfields->By_ );
-        Field1D *Bz1D = static_cast<Field1D *>( EMfields->Bz_ );
+        Field1D *Bz1D = static_cast<Field1D *>( EMfields->Bz_ );*/
+        const Field *E[3]{ EMfields->Ex_, EMfields->Ey_, EMfields->Ez_ };
+        const Field *B[3]{ EMfields->Bx_, EMfields->By_, EMfields->Bz_ };
+        const double *const __restrict__ E1 = E[1]->data_;
+        const double *const __restrict__ E2 = E[2]->data_;
+        double *const __restrict__ B1 = B[1]->data_;
+        double *const __restrict__ B2 = B[2]->data_;

         // Lasers
         double by = 0., bz = 0.;
         vector<double> pos( 1 );
@@ -88,11 +94,25 @@ void ElectroMagnBC1D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch *patch )
         for( unsigned int ilaser=0 ; ilaser<vecLaser.size() ; ilaser++ ) {
             by += vecLaser[ilaser]->getAmplitude0( pos, time_dual, 0, 0 );
             bz += vecLaser[ilaser]->getAmplitude1( pos, time_dual, 0, 0 );
         }

+#ifdef SMILEI_OPENACC_MODE
+        const int sizeofE1 = E[1]->number_of_points_;
+        const int sizeofE2 = E[2]->number_of_points_;
+        const int sizeofB1 = B[1]->number_of_points_;
+        const int sizeofB2 = B[2]->number_of_points_;
+#endif
         // Apply Silver-Mueller EM boundary condition at x=xmin or xmax
-        ( *By1D )( iB ) = -sign_*Alpha*( *Ez1D )( iE ) + Beta*( ( *By1D )( iB_old )-By_val ) + Gamma*by + By_val;
-        ( *Bz1D )( iB ) =  sign_*Alpha*( *Ey1D )( iE ) + Beta*( ( *Bz1D )( iB_old )-Bz_val ) + Gamma*bz + Bz_val;
-
+#ifdef SMILEI_OPENACC_MODE
+        #pragma acc parallel present(E1[0:sizeofE1],E2[0:sizeofE2],B1[0:sizeofB1],B2[0:sizeofB2])
+#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
+        #pragma omp target
+#endif
+        {
+            //( *By1D )( iB_ ) = -sign_*Alpha_*( *Ez1D )( iE_ ) + Beta_*( ( *By1D )( iB_old_ )-By_val_ ) + Gamma_*by + By_val_;
+            //( *Bz1D )( iB_ ) =  sign_*Alpha_*( *Ey1D )( iE_ ) + Beta_*( ( *Bz1D )( iB_old_ )-Bz_val_ ) + Gamma_*bz + Bz_val_;
+            B1[ iB_ ] = -sign_ * Alpha_ * E2[iE_] + Beta_ * ( B1[iB_old_] - By_val_) + Gamma_ * by + By_val_;
+            B2[ iB_ ] =  sign_ * Alpha_ * E1[iE_] + Beta_ * ( B2[iB_old_] - Bz_val_) + Gamma_ * bz + Bz_val_;
+        }
     }
 }
diff --git a/src/ElectroMagnBC/ElectroMagnBC1D_SM.h b/src/ElectroMagnBC/ElectroMagnBC1D_SM.h
index ac17f856d..ccbc499c1 100755
--- a/src/ElectroMagnBC/ElectroMagnBC1D_SM.h
+++ b/src/ElectroMagnBC/ElectroMagnBC1D_SM.h
@@ -17,16 +17,16 @@ class ElectroMagnBC1D_SM : public ElectroMagnBC1D

     void save_fields( Field *, Patch *patch ) override;

-    double By_val, Bz_val;
+    double By_val_, Bz_val_;

 private:

     //! Constants used for the Silver-Mueller boundary conditions
-    double Alpha, Beta, Gamma;
+    double Alpha_, Beta_, Gamma_;
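(Note that the B2 update keeps the +sign_ of the commented-out original; the raw-pointer rewrite had flipped it.) For orientation, the boundary update is a one-point absorbing/injecting Silver-Mueller relation; a minimal sketch, assuming alpha/beta/gamma are the precomputed constants and b_val the static offset saved by save_fields():

    // One-sided Silver-Mueller update for one transverse B component at an
    // x boundary (1D sketch, illustrative names only).
    inline double silver_mueller_update( double sign, double alpha, double beta, double gamma,
                                         double e_transverse,   // conjugate E at the boundary
                                         double b_old,          // previous B at the boundary
                                         double b_val,          // static field offset
                                         double laser )         // injected laser amplitude
    {
        return sign * alpha * e_transverse + beta * ( b_old - b_val ) + gamma * laser + b_val;
    }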
    //! Locations to apply the profile
-    unsigned int iE, iB, iB_old;
+    unsigned int iE_, iB_, iB_old_;

     int sign_;
 };
diff --git a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp
index 7e04123f4..9b9f0d53d 100755
--- a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp
+++ b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp
@@ -15,28 +15,101 @@ MA_Solver1D_norm::~MA_Solver1D_norm()
 {
 }

 void MA_Solver1D_norm::operator()( ElectroMagn *fields )
 {
+    {
     const unsigned int nx_p = fields->dimPrim[0];
     const unsigned int nx_d = fields->dimDual[0];
-    Field1D *Ex1D = static_cast<Field1D *>( fields->Ex_ );
+    /*Field1D *Ex1D = static_cast<Field1D *>( fields->Ex_ );
     Field1D *Ey1D = static_cast<Field1D *>( fields->Ey_ );
     Field1D *Ez1D = static_cast<Field1D *>( fields->Ez_ );
     Field1D *By1D = static_cast<Field1D *>( fields->By_ );
     Field1D *Bz1D = static_cast<Field1D *>( fields->Bz_ );
     Field1D *Jx1D = static_cast<Field1D *>( fields->Jx_ );
     Field1D *Jy1D = static_cast<Field1D *>( fields->Jy_ );
-    Field1D *Jz1D = static_cast<Field1D *>( fields->Jz_ );
-
+    Field1D *Jz1D = static_cast<Field1D *>( fields->Jz_ );*/
+
+    double *const __restrict__ Ex1D = fields->Ex_->data(); // [x] : dual in x primal in y,z
+    double *const __restrict__ Ey1D = fields->Ey_->data(); // [x] : dual in y primal in x,z
+    double *const __restrict__ Ez1D = fields->Ez_->data(); // [x] : dual in z primal in x,y
+    //const double *const __restrict__ Bx1D = fields->Bx_->data(); // [x] : dual in y,z primal in x
+    const double *const __restrict__ By1D = fields->By_->data(); // [x] : dual in x,z primal in y
+    const double *const __restrict__ Bz1D = fields->Bz_->data(); // [x] : dual in x,y primal in z
+    const double *const __restrict__ Jx1D = fields->Jx_->data(); // [x] : dual in x primal in y,z
+    const double *const __restrict__ Jy1D = fields->Jy_->data(); // [x] : dual in y primal in x,z
+    const double *const __restrict__ Jz1D = fields->Jz_->data(); // [x] : dual in z primal in x,y
+
+    {
+        fields->Ex_->copyFromDeviceToHost();
+        fields->Ey_->copyFromDeviceToHost();
+        fields->Ez_->copyFromDeviceToHost();
+        fields->Jx_->copyFromDeviceToHost();
+        fields->Jy_->copyFromDeviceToHost();
+        fields->Jz_->copyFromDeviceToHost();
+    }
+    std::cout<< "printing before in MA solver ex, ey and ez for nx_d="<<nx_d<<std::endl;
+
+    // Longitudinal field ex on the dual grid
+#if defined( SMILEI_OPENACC_MODE )
+    const int sizeofEx = fields->Ex_->number_of_points_;
+    const int sizeofEy = fields->Ey_->number_of_points_;
+    const int sizeofEz = fields->Ez_->number_of_points_;
+    //const int sizeofBx = fields->Bx_->number_of_points_;
+    const int sizeofBy = fields->By_->number_of_points_;
+    const int sizeofBz = fields->Bz_->number_of_points_;
+    #pragma acc parallel present( Ex1D[0:sizeofEx], Jx1D[0:sizeofEx] )
+    #pragma acc loop gang worker vector
+#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
+    #pragma omp target
+    #pragma omp teams distribute parallel for
+#endif
+#if !defined( SMILEI_ACCELERATOR_MODE )
+    #pragma omp simd
+#endif
+    for( unsigned int ix=0 ; ix<nx_d ; ix++ ) {
+        Ex1D[ix] += -dt * Jx1D[ix];
+    }
+
+    // Transverse fields ey, ez on the primal grid
+#if defined( SMILEI_OPENACC_MODE )
+    #pragma acc parallel present( Ey1D[0:sizeofEy], Ez1D[0:sizeofEz], By1D[0:sizeofBy], Bz1D[0:sizeofBz], Jy1D[0:sizeofEy], Jz1D[0:sizeofEz] )
+    #pragma acc loop gang worker vector
+#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
+    #pragma omp target
+    #pragma omp teams distribute parallel for
+#endif
+#if !defined( SMILEI_ACCELERATOR_MODE )
+    #pragma omp simd
+#endif
+    for( unsigned int ix=0 ; ix<nx_p ; ix++ ) {
+        Ey1D[ix] += -dt_ov_dx * ( Bz1D[ix+1] - Bz1D[ix] ) - dt * Jy1D[ix];
+        Ez1D[ix] +=  dt_ov_dx * ( By1D[ix+1] - By1D[ix] ) - dt * Jz1D[ix];
+    }
+
+    {
+        fields->Ex_->copyFromDeviceToHost();
+        fields->Ey_->copyFromDeviceToHost();
+        fields->Ez_->copyFromDeviceToHost();
     }
-    // Transverse fields ey, ez are defined on the primal grid
-    for( unsigned int ix=0 ; ix<nx_p ; ix++ ) {
-        ( *Ey1D )( ix ) += -dt_ov_dx * ( ( *Bz1D )( ix+1 ) - ( *Bz1D )( ix ) ) - dt * ( *Jy1D )( ix );
-        ( *Ez1D )( ix ) +=  dt_ov_dx * ( ( *By1D )( ix+1 ) - ( *By1D )( ix ) ) - dt * ( *Jz1D )( ix );
-    }
+    }
+
+    {
+    const unsigned int nx_p = fields->dimPrim[0];
+    const unsigned int nx_d = fields->dimDual[0];
+    double *const __restrict__ Ex1D = fields->Ex_->data(); // [x] : dual in x primal in y,z
+    double *const __restrict__ Ey1D = fields->Ey_->data(); // [x] : dual in y primal in x,z
+    double *const __restrict__ Ez1D = fields->Ez_->data(); // [x] : dual in z primal in x,y
+
+    std::cout<< "printing after in MA solver ex, ey and ez for nx_d="<<nx_d<<std::endl;
+    }
 }
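The Maxwell-Ampère step rewritten above advances E from J and curl(B). A compact sketch of the 1D update in normalized units (function and parameter names are illustrative; dt_ov_dx = dt/dx):

    // 1D Maxwell-Ampere sketch: Ex lives on the dual grid (nx_d points),
    // Ey/Ez on the primal grid (nx_p points), By/Bz dual in x.
    void ampere_1d( int nx_p, int nx_d, double dt, double dt_ov_dx,
                    double *Ex, double *Ey, double *Ez,
                    const double *By, const double *Bz,
                    const double *Jx, const double *Jy, const double *Jz )
    {
        for( int ix = 0; ix < nx_d; ++ix ) {
            Ex[ix] += -dt * Jx[ix];
        }
        for( int ix = 0; ix < nx_p; ++ix ) {
            Ey[ix] += -dt_ov_dx * ( Bz[ix+1] - Bz[ix] ) - dt * Jy[ix];
            Ez[ix] +=  dt_ov_dx * ( By[ix+1] - By[ix] ) - dt * Jz[ix];
        }
    }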
diff --git a/src/ElectroMagnSolver/MF_Solver1D_Yee.cpp b/src/ElectroMagnSolver/MF_Solver1D_Yee.cpp
--- a/src/ElectroMagnSolver/MF_Solver1D_Yee.cpp
+++ b/src/ElectroMagnSolver/MF_Solver1D_Yee.cpp
@@ -28,17 +28,56 @@ void MF_Solver1D_Yee::operator()( ElectroMagn *fields )
     const unsigned int nx_d = fields->dimDual[0];
     // Static-cast of the fields
-    Field1D* Ey1D;
+    /*Field1D* Ey1D;
     Field1D* Ez1D;
     if (isEFilterApplied) {
         Ey1D = static_cast<Field1D *>(fields->filter_->Ey_[0]);
         Ez1D = static_cast<Field1D *>(fields->filter_->Ez_[0]);
     } else {
         Ey1D = static_cast<Field1D *>(fields->Ey_);
         Ez1D = static_cast<Field1D *>(fields->Ez_);
-    }
-    Field1D *By1D = static_cast<Field1D *>( fields->By_ );
-    Field1D *Bz1D = static_cast<Field1D *>( fields->Bz_ );
+    }*/
+    const double *const __restrict__ Ey1D = isEFilterApplied ? fields->filter_->Ey_[0]->data() :
+                                                               fields->Ey_->data(); // [ix] : dual in y primal in x,z
+    const double *const __restrict__ Ez1D = isEFilterApplied ? fields->filter_->Ez_[0]->data() :
+                                                               fields->Ez_->data();// [ix] : dual in z primal in x,y
+
+    //Field1D *By1D = static_cast<Field1D *>( fields->By_ );
+    //Field1D *Bz1D = static_cast<Field1D *>( fields->Bz_ );
+    double *const __restrict__ By1D = fields->By_->data();// [ix] : dual in x,z primal in y
+    double *const __restrict__ Bz1D = fields->Bz_->data();// [ix] : dual in x,y primal in z
+
+    // to be deleted
+    /*std::cout<< "printing before in FM solver by and bz for nx_d-1="<<nx_d-1<<std::endl;*/
+
+#if defined( SMILEI_OPENACC_MODE )
+    const int sizeofEy = fields->Ey_->number_of_points_;
+    const int sizeofEz = fields->Ez_->number_of_points_;
+    const int sizeofBy = fields->By_->number_of_points_;
+    const int sizeofBz = fields->Bz_->number_of_points_;
+    #pragma acc parallel present( By1D[0:sizeofBy], Bz1D[0:sizeofBz],Ey1D[0:sizeofEy],Ez1D[0:sizeofEz] )
+    #pragma acc loop gang vector
+#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
+    #pragma omp target
+    #pragma omp teams distribute parallel for
+#endif
+#if !defined( SMILEI_ACCELERATOR_MODE )
+    #pragma omp simd
+#endif
     for( unsigned int ix=1 ; ix<nx_d-1 ; ix++ ) {
-        ( *By1D )( ix ) +=  dt_ov_dx * ( ( *Ez1D )( ix ) - ( *Ez1D )( ix-1 ) );
-        ( *Bz1D )( ix ) += -dt_ov_dx * ( ( *Ey1D )( ix ) - ( *Ey1D )( ix-1 ) );
+        By1D[ix] +=  dt_ov_dx * ( Ez1D[ix] - Ez1D[ix-1] );
+        Bz1D[ix] += -dt_ov_dx * ( Ey1D[ix] - Ey1D[ix-1] );
     }
+
+    {
+        fields->By_->copyFromDeviceToHost();
+        fields->Bz_->copyFromDeviceToHost();
     }
+    std::cout<< "printing after in FM solver by and bz for nx_d-1="<<nx_d-1<<std::endl;
 }
diff --git a/src/Interpolator/Interpolator1D.cpp b/src/Interpolator/Interpolator1D.cpp
--- a/src/Interpolator/Interpolator1D.cpp
+++ b/src/Interpolator/Interpolator1D.cpp
@@ ... @@ Interpolator1D::Interpolator1D( Patch *patch )
-    index_domain_begin = patch->getCellStartingGlobalIndex( 0 );
+    i_domain_begin_ = patch->getCellStartingGlobalIndex( 0 );
 }
diff --git a/src/Interpolator/Interpolator1D.h b/src/Interpolator/Interpolator1D.h
index c1324e0a3..408b6ac3a 100755
--- a/src/Interpolator/Interpolator1D.h
+++ b/src/Interpolator/Interpolator1D.h
@@ -22,7 +22,7 @@ class Interpolator1D : public Interpolator
 protected:
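Likewise, the Maxwell-Faraday (Yee) step advances B from curl(E) on the interior dual nodes; a matching 1D sketch (illustrative names, same dt_ov_dx convention as above):

    // 1D Maxwell-Faraday sketch: By/Bz are dual in x, so the update runs
    // over interior dual nodes 1 .. nx_d-2; the boundary nodes are handled
    // by the EM boundary conditions.
    void faraday_1d( int nx_d, double dt_ov_dx,
                     const double *Ey, const double *Ez, double *By, double *Bz )
    {
        for( int ix = 1; ix < nx_d - 1; ++ix ) {
            By[ix] +=  dt_ov_dx * ( Ez[ix] - Ez[ix-1] );
            Bz[ix] += -dt_ov_dx * ( Ey[ix] - Ey[ix-1] );
        }
    }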
    //! Inverse of the spatial-step
     double dx_inv_;
-    unsigned int index_domain_begin;
+    unsigned int i_domain_begin_;
 };

 #endif
diff --git a/src/Interpolator/Interpolator1D2Order.cpp b/src/Interpolator/Interpolator1D2Order.cpp
index e867b29be..a74c951cd 100755
--- a/src/Interpolator/Interpolator1D2Order.cpp
+++ b/src/Interpolator/Interpolator1D2Order.cpp
@@ -14,7 +14,6 @@ using namespace std;
 Interpolator1D2Order::Interpolator1D2Order( Params &params, Patch *patch ) : Interpolator1D( patch )
 {
     dx_inv_ = 1.0/params.cell_length[0];
-
 }

 // ---------------------------------------------------------------------------------------------------------------------
@@ -23,31 +22,27 @@ Interpolator1D2Order::Interpolator1D2Order( Params &params, Patch *patch ) : Interpolator1D( patch )
 void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, int ipart, int nparts, double *ELoc, double *BLoc )
 {
     // Static cast of the electromagnetic fields
-    Field1D *Ex1D   = static_cast<Field1D *>( EMfields->Ex_ );
-    Field1D *Ey1D   = static_cast<Field1D *>( EMfields->Ey_ );
-    Field1D *Ez1D   = static_cast<Field1D *>( EMfields->Ez_ );
-    Field1D *Bx1D_m = static_cast<Field1D *>( EMfields->Bx_m );
-    Field1D *By1D_m = static_cast<Field1D *>( EMfields->By_m );
-    Field1D *Bz1D_m = static_cast<Field1D *>( EMfields->Bz_m );
+    Field1D *Ex1D   = static_cast<Field1D *>( EMfields->Ex_ );
+    Field1D *Ey1D   = static_cast<Field1D *>( EMfields->Ey_ );
+    Field1D *Ez1D   = static_cast<Field1D *>( EMfields->Ez_ );
+    Field1D *Bx1D_m = static_cast<Field1D *>( EMfields->Bx_m );
+    Field1D *By1D_m = static_cast<Field1D *>( EMfields->By_m );
+    Field1D *Bz1D_m = static_cast<Field1D *>( EMfields->Bz_m );

     // Particle position (in units of the spatial-step)
-    double xpn = particles.position( 0, ipart )*dx_inv_;
+    double xjn = particles.position( 0, ipart ) * dx_inv_;
     // Calculate coeffs
-    int idx_p[1], idx_d[1];
-    double delta_p[1];
-    double coeffxp[3];
-    double coeffxd[3];
-    coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p );
+    coeffs( xjn );

     // Interpolate the fields from the Dual grid : Ex, By, Bz
-    *( ELoc+0*nparts ) = compute( coeffxd, Ex1D,   idx_d[0] );
-    *( BLoc+1*nparts ) = compute( coeffxd, By1D_m, idx_d[0] );
-    *( BLoc+2*nparts ) = compute( coeffxd, Bz1D_m, idx_d[0] );
+    *( ELoc+0*nparts ) = compute( coeffd_, Ex1D,   id_ );
+    *( BLoc+1*nparts ) = compute( coeffd_, By1D_m, id_ );
+    *( BLoc+2*nparts ) = compute( coeffd_, Bz1D_m, id_ );

     // Interpolate the fields from the Primal grid : Ey, Ez, Bx
-    *( ELoc+1*nparts ) = compute( coeffxp, Ey1D,   idx_p[0] );
-    *( ELoc+2*nparts ) = compute( coeffxp, Ez1D,   idx_p[0] );
-    *( BLoc+0*nparts ) = compute( coeffxp, Bx1D_m, idx_p[0] );
+    *( ELoc+1*nparts ) = compute( coeffp_, Ey1D,   ip_ );
+    *( ELoc+2*nparts ) = compute( coeffp_, Ez1D,   ip_ );
+    *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ );

 }//END Interpolator1D2Order
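The compute() helper used throughout these hunks is a plain 3-point weighted stencil; as a self-contained reference:

    // 2nd-order (3-point) field gather around node idx, as in compute():
    // coeff[] holds the quadratic shape factors for nodes idx-1, idx, idx+1.
    inline double gather3( const double *coeff, const double *f, int idx )
    {
        return coeff[0] * f[idx-1] + coeff[1] * f[idx] + coeff[2] * f[idx+1];
    }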
@@ -83,37 +78,33 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles &particles,
     }

     // Particle position (in units of the spatial-step)
-    double xpn = particles.position( 0, ipart )*dx_inv_;
+    double xjn = particles.position( 0, ipart )*dx_inv_;
     // Calculate coeffs
-    int idx_p[1], idx_d[1];
-    double delta_p[1];
-    double coeffxp[3];
-    double coeffxd[3];
-    coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p );
+    coeffs( xjn );

     int nparts( particles.numberOfParticles() );

     // Interpolate the fields from the Dual grid : Ex, By, Bz
-    *( ELoc+0*nparts ) = compute( coeffxd, Ex1D,   idx_d[0] );
-    *( BLoc+1*nparts ) = compute( coeffxd, By1D_m, idx_d[0] );
-    *( BLoc+2*nparts ) = compute( coeffxd, Bz1D_m, idx_d[0] );
+    *( ELoc+0*nparts ) = compute( coeffd_, Ex1D,   id_ );
+    *( BLoc+1*nparts ) = compute( coeffd_, By1D_m, id_ );
+    *( BLoc+2*nparts ) = compute( coeffd_, Bz1D_m, id_ );

     // Interpolate the fields from the Primal grid : Ey, Ez, Bx
-    *( ELoc+1*nparts ) = compute( coeffxp, Ey1D,   idx_p[0] );
-    *( ELoc+2*nparts ) = compute( coeffxp, Ez1D,   idx_p[0] );
-    *( BLoc+0*nparts ) = compute( coeffxp, Bx1D_m, idx_p[0] );
+    *( ELoc+1*nparts ) = compute( coeffp_, Ey1D,   ip_ );
+    *( ELoc+2*nparts ) = compute( coeffp_, Ez1D,   ip_ );
+    *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ );

     // Interpolate the fields from the Primal grid : Jy, Jz, Rho
-    JLoc->y     = compute( coeffxp, Jy1D,  idx_p[0] );
-    JLoc->z     = compute( coeffxp, Jz1D,  idx_p[0] );
-    ( *RhoLoc ) = compute( coeffxp, Rho1D, idx_p[0] );
+    JLoc->y     = compute( coeffp_, Jy1D,  ip_ );
+    JLoc->z     = compute( coeffp_, Jz1D,  ip_ );
+    ( *RhoLoc ) = compute( coeffp_, Rho1D, ip_ );

     // Interpolate the fields from the Dual grid : Jx
-    JLoc->x = compute( coeffxd, Jx1D, idx_d[0] );
+    JLoc->x = compute( coeffd_, Jx1D, id_ );

     if (smpi->use_BTIS3){
-        *( BLocyBTIS3+0*nparts ) = compute( coeffxp, By1DBTIS3, idx_p[0] );
-        *( BLoczBTIS3+0*nparts ) = compute( coeffxp, Bz1DBTIS3, idx_p[0] );
+        *( BLocyBTIS3+0*nparts ) = compute( &coeffp_[1], By1DBTIS3, ip_ );
+        *( BLoczBTIS3+0*nparts ) = compute( &coeffp_[1], Bz1DBTIS3, ip_ );
     }

 }
@@ -122,114 +113,269 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles &particles,
 void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *istart, int *iend, double *FieldLoc, double *, double *, double * )
 {
     Field1D *F = static_cast<Field1D *>( *field );
-    int idx_p[1], idx_d[1];
-    double delta_p[1];
-    double coeffxp[3];
-    double coeffxd[3];
-    double *coeff = F->isDual( 0 ) ? coeffxd : coeffxp;
-    int *i = F->isDual( 0 ) ? &idx_d[0] : &idx_p[0];
+    double *coeff = F->isDual( 0 ) ? coeffd_ : coeffp_;
+    int *i = F->isDual( 0 ) ? &id_ : &ip_;
    for( int ipart=*istart ; ipart<*iend; ipart++ ) {
-        double xpn = particles.position( 0, ipart )*dx_inv_;
-        coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p );
+        double xjn = particles.position( 0, ipart )*dx_inv_;
+        coeffs( xjn );
         FieldLoc[ipart] = compute( coeff, F, *i );
     }
 }

-void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, unsigned int, int )
+void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields,
+                                          Particles &particles, SmileiMPI *smpi,
+                                          int *istart, int *iend, int ithread, unsigned int, int )
 {
-    double *Epart = &( smpi->dynamics_Epart[ithread][0] );
-    double *Bpart = &( smpi->dynamics_Bpart[ithread][0] );
-    int *iold = &( smpi->dynamics_iold[ithread][0] );
-    double *delta = &( smpi->dynamics_deltaold[ithread][0] );
-
-    // Static cast of the electromagnetic fields
-    Field1D *Ex1D = static_cast<Field1D *>( EMfields->Ex_ );
-    Field1D *Ey1D = static_cast<Field1D *>( EMfields->Ey_ );
-    Field1D *Ez1D = static_cast<Field1D *>( EMfields->Ez_ );
-    Field1D *Bx1D = static_cast<Field1D *>( EMfields->Bx_m );
-    Field1D *By1D = static_cast<Field1D *>( EMfields->By_m );
-    Field1D *Bz1D = static_cast<Field1D *>( EMfields->Bz_m );
+    {
+    double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data();//&( smpi->dynamics_Epart[ithread][0] );
+    double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data();//&( smpi->dynamics_Bpart[ithread][0] );

+    int *const __restrict__ iold     = smpi->dynamics_iold[ithread].data();//&( smpi->dynamics_iold[ithread][0] );
+    double *const __restrict__ delta = smpi->dynamics_deltaold[ithread].data();//&( smpi->dynamics_deltaold[ithread][0] );
+    const double *const __restrict__ position_x = particles.getPtrPosition( 0 );
+
+    // Static cast of the electromagnetic fields
+    const double *const __restrict__ Ex1D = static_cast<Field1D *>( EMfields->Ex_ )->data();
+    const double *const __restrict__ Ey1D = static_cast<Field1D *>( EMfields->Ey_ )->data();
+    const double *const __restrict__ Ez1D = static_cast<Field1D *>( EMfields->Ez_ )->data();
+    const double *const __restrict__ Bx1D = static_cast<Field1D *>( EMfields->Bx_m )->data();
+    const double *const __restrict__ By1D = static_cast<Field1D *>( EMfields->By_m )->data();
+    const double *const __restrict__ Bz1D = static_cast<Field1D *>( EMfields->Bz_m )->data();
+
+#if defined(SMILEI_OPENACC_MODE)
+    const int sizeofEx = EMfields->Ex_->size();
+    const int sizeofEy = EMfields->Ey_->size();
+    const int sizeofEz = EMfields->Ez_->size();
+    const int sizeofBx = EMfields->Bx_m->size();
+    const int sizeofBy = EMfields->By_m->size();
+    const int sizeofBz = EMfields->Bz_m->size();
+#endif

     //Loop on bin particles
-    int nparts = particles.numberOfParticles();
+    const int nparts = particles.numberOfParticles();
+    const int first_index = *istart;
+    const int last_index  = *iend;
+    double accdx_inv[2];
+    accdx_inv[0]= dx_inv_;
+    /*std::cout<< "printing before in interpolator ex, ey and ez then bx,by,bz" <<std::endl;*/
+    if (!smpi->use_BTIS3){ // without BTIS-3 interpolation
+        EMfields->Ex_->copyFromDeviceToHost();
+        EMfields->Ey_->copyFromDeviceToHost();
+        EMfields->Ez_->copyFromDeviceToHost();
+        EMfields->Jx_->copyFromDeviceToHost();
+        EMfields->Jy_->copyFromDeviceToHost();
+        EMfields->Jz_->copyFromDeviceToHost();
+    }
+    std::cout<< "printing before in interpolator after copyFromDeviceToHost ex, ey and ez then bx,by,bz" <<std::endl;
+
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[0*nparts] ), nparts );
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[1*nparts] ), nparts );
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( (
smpi->dynamics_Epart[ithread] )[2*nparts] ), nparts );
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[0*nparts] ), nparts );
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[1*nparts] ), nparts );
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[2*nparts] ), nparts );
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_iold[ithread] )[0] ), nparts );
+    smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_deltaold[ithread] )[0]), nparts );
+
+    std::cout<<"print in interpolator fields wrapper eloc before computation and after CopyDeviceToHost"<<std::endl;
+
+    if (!smpi->use_BTIS3){
+        //for (int ipart=*istart; ipart < *iend; ipart++){
+#if defined( SMILEI_ACCELERATOR_GPU_OMP )
+        #pragma omp target map( to : i_domain_begin_) is_device_ptr (position_x)
+        #pragma omp teams distribute parallel for
+#elif defined(SMILEI_OPENACC_MODE)
+        #pragma acc enter data create(this)
+        #pragma acc update device(this)
+        size_t interpolation_range_size = ( last_index + 0 * nparts ) - first_index;
+        #pragma acc parallel present(ELoc [first_index:interpolation_range_size],\
+                                     BLoc [first_index:interpolation_range_size],\
+                                     iold [first_index:interpolation_range_size],\
+                                     delta [first_index:interpolation_range_size],\
+                                     Ex1D [0:sizeofEx],\
+                                     Ey1D [0:sizeofEy],\
+                                     Ez1D [0:sizeofEz],\
+                                     Bx1D [0:sizeofBx],\
+                                     By1D [0:sizeofBy],\
+                                     Bz1D [0:sizeofBz])\
+                             deviceptr(position_x) \
+                             copyin(accdx_inv[0:2]) //copyin(dx_inv_[:1]) //copyin(dx_inv_)
+        #pragma acc loop gang worker vector
+#endif
+        for( int ipart = first_index; ipart < last_index; ipart++ ) {
+            // Normalized particle position
+            //double xpn = position_x[ipart] * dx_inv_;//particles.position( 0, ipart )*dx_inv_;
+            const double xpn = position_x[ipart] * accdx_inv[0];
+            // Calculate coeffs
+            int idx_p[1], idx_d[1];
+            double delta_p[1];
+            double coeffxp[3];
+            double coeffxd[3];
+
+            coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p );
+
+            // Interpolation of Ex^(d)
+            ELoc[0*nparts+ipart] = compute( &coeffxd[0], Ex1D, idx_d[0] );
+            // Interpolation of Ey^(p)
+            ELoc[1*nparts+ipart] = compute( &coeffxp[0], Ey1D, idx_p[0] );
+            // Interpolation of Ez^(p)
+            ELoc[2*nparts+ipart] = compute( &coeffxp[0], Ez1D, idx_p[0] );
+            // Interpolation of Bx^(p)
+            BLoc[0*nparts+ipart] = compute( &coeffxp[0], Bx1D, idx_p[0] );
+            // Interpolation of By^(d)
+            BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] );
+            // Interpolation of Bz^(d)
+            BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] );
+
+            //Buffering of iold and delta
+            iold[0*nparts+ipart]  = idx_p[0];
+            delta[0*nparts+ipart] = delta_p[0];
+
+        } // end ipart loop
+#if defined(SMILEI_OPENACC_MODE)
+        #pragma acc exit data delete(this)
+#endif
+
+    } else { // with B-TIS3 interpolation
+        double *const __restrict__ BypartBTIS3 = smpi->dynamics_Bpart_yBTIS3[ithread].data();
+        double *const __restrict__ BzpartBTIS3 = smpi->dynamics_Bpart_zBTIS3[ithread].data();
+        const double *const __restrict__ By1D_mBTIS3 = static_cast<Field1D *>( EMfields->By_mBTIS3 )->data();
+        const double *const __restrict__ Bz1D_mBTIS3 = static_cast<Field1D *>( EMfields->Bz_mBTIS3 )->data();
+#if defined( SMILEI_ACCELERATOR_GPU_OMP )
+        #pragma omp target map( to : i_domain_begin_) is_device_ptr ( position_x)
+        #pragma omp teams distribute parallel for
+#elif defined(SMILEI_OPENACC_MODE)
+        #pragma acc enter data create(this)
+        #pragma acc update device(this)
+        size_t interpolation_range_size = ( last_index + 1 * nparts ) - first_index;
+        #pragma acc parallel present(ELoc [first_index:interpolation_range_size],\
+                                     BLoc [first_index:interpolation_range_size],\
+                                     BypartBTIS3 [first_index:interpolation_range_size],\
+                                     BzpartBTIS3 [first_index:interpolation_range_size],\
+                                     iold [first_index:interpolation_range_size],\
+                                     delta [first_index:interpolation_range_size],\
+                                     Ex1D [0:sizeofEx],\
+                                     Ey1D [0:sizeofEy],\
+                                     Ez1D [0:sizeofEz],\
+                                     Bx1D [0:sizeofBx],\
+                                     By1D [0:sizeofBy],\
+                                     Bz1D [0:sizeofBz],\
+                                     By1D_mBTIS3 [0:sizeofEz],\
+                                     Bz1D_mBTIS3 [0:sizeofEy])\
+                             deviceptr(position_x) \
+                             copyin(dx_inv_)
+        #pragma acc loop gang worker vector
+#endif
+
+        // would it be possible to just use another #pragma acc parallel present( ... )
+        // for By1D_mBTIS3 [0:sizeofEz], Bz1D_mBTIS3 [0:sizeofEy],
+        //     BypartBTIS3 [first_index:interpolation_range_size],
+        //     BzpartBTIS3 [first_index:interpolation_range_size] ?
+
+        /* Field1D *By1D_mBTIS3 = static_cast<Field1D *>( EMfields->By_mBTIS3 );
+        Field1D *Bz1D_mBTIS3 = static_cast<Field1D *>( EMfields->Bz_mBTIS3 );
+        double *BypartBTIS3 = &( smpi->dynamics_Bpart_yBTIS3[ithread][0] );
+        double *BzpartBTIS3 = &( smpi->dynamics_Bpart_zBTIS3[ithread][0] );*/
+
         for (int ipart=*istart; ipart < *iend; ipart++){

             // Normalized particle position
-            double xpn = particles.position( 0, ipart )*dx_inv_;
+            double xpn = position_x[ipart] * dx_inv_;//particles.position( 0, ipart )*dx_inv_;

             // Calculate coeffs
             int idx_p[1], idx_d[1];
             double delta_p[1];
             double coeffxp[3];
             double coeffxd[3];
+
             coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p );

             // Interpolation of Ex^(d)
-            *( Epart+0*nparts+ipart ) = compute( coeffxd, Ex1D, idx_d[0] );
+            ELoc[0*nparts+ipart] = compute( coeffxd, Ex1D, idx_d[0] );
             // Interpolation of Ey^(p)
-            *( Epart+1*nparts+ipart ) = compute( coeffxp, Ey1D, idx_p[0] );
+            ELoc[1*nparts+ipart] = compute( coeffxp, Ey1D, idx_p[0] );
             // Interpolation of Ez^(p)
-            *( Epart+2*nparts+ipart ) = compute( coeffxp, Ez1D, idx_p[0] );
+            ELoc[2*nparts+ipart] = compute( coeffxp, Ez1D, idx_p[0] );
             // Interpolation of Bx^(p)
-            *( Bpart+0*nparts+ipart ) = compute( coeffxp, Bx1D, idx_p[0] );
+            BLoc[0*nparts+ipart] = compute( coeffxp, Bx1D, idx_p[0] );
             // Interpolation of By^(d)
-            *( Bpart+1*nparts+ipart ) = compute( coeffxd, By1D, idx_d[0] );
+            BLoc[1*nparts+ipart] = compute( coeffxd, By1D, idx_d[0] );
             // Interpolation of Bz^(d)
-            *( Bpart+2*nparts+ipart ) = compute( coeffxd, Bz1D, idx_d[0] );
+            BLoc[2*nparts+ipart] = compute( coeffxd, Bz1D, idx_d[0] );
+            // Interpolation of ByBTIS3^(p)
+            BypartBTIS3[0*nparts+ipart ] = compute( coeffxp, By1D_mBTIS3, idx_p[0] );
+            // Interpolation of BzBTIS3^(p)
+            BzpartBTIS3[0*nparts+ipart ] = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] );

             //Buffering of iold and delta
-            *( iold+0*nparts+ipart) = idx_p[0];
-            *( delta+0*nparts+ipart) = delta_p[0];
+            iold[0*nparts+ipart]  = idx_p[0];
+            delta[0*nparts+ipart] = delta_p[0];

         } // end ipart loop

-    } else { // with B-TIS3 interpolation
-
-        Field1D *By1D_mBTIS3 = static_cast<Field1D *>( EMfields->By_mBTIS3 );
-        Field1D *Bz1D_mBTIS3 = static_cast<Field1D *>( EMfields->Bz_mBTIS3 );
-        double *BypartBTIS3 = &( smpi->dynamics_Bpart_yBTIS3[ithread][0] );
-        double *BzpartBTIS3 = &( smpi->dynamics_Bpart_zBTIS3[ithread][0] );
-
-        for (int ipart=*istart; ipart < *iend; ipart++){
-
-            // Normalized particle position
-            double xpn = particles.position( 0, ipart )*dx_inv_;
-
-            // Calculate coeffs
-            int idx_p[1], idx_d[1];
-            double delta_p[1];
-            double coeffxp[3];
-            double coeffxd[3];
-
-            coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p );
-
-            // Interpolation of Ex^(d)
-            *( Epart+0*nparts+ipart ) = compute( coeffxd, Ex1D, idx_d[0] );
-            // Interpolation of Ey^(p)
-            *( Epart+1*nparts+ipart ) = compute( coeffxp, Ey1D, idx_p[0] );
-            // Interpolation of Ez^(p)
-            *( Epart+2*nparts+ipart ) = compute( coeffxp, Ez1D, idx_p[0] );
-            // Interpolation of Bx^(p)
-            *( Bpart+0*nparts+ipart ) = compute( coeffxp, Bx1D, idx_p[0] );
-            // Interpolation of By^(d)
-            *( Bpart+1*nparts+ipart ) = compute( coeffxd, By1D, idx_d[0] );
-            // Interpolation of Bz^(d)
-            *( Bpart+2*nparts+ipart ) = compute( coeffxd, Bz1D, idx_d[0] );
-            // Interpolation of ByBTIS3^(p)
-            *( BypartBTIS3+0*nparts ) = compute( coeffxp, By1D_mBTIS3, idx_p[0] );
-            // Interpolation of BzBTIS3^(p)
-            *( BzpartBTIS3+0*nparts ) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] );
-
-            //Buffering of iold and delta
-            *( iold+0*nparts+ipart) = idx_p[0];
-            *( delta+0*nparts+ipart) = delta_p[0];
-
-        } // end ipart loop
-
+#if defined(SMILEI_OPENACC_MODE)
+        #pragma acc exit data delete(this)
+#endif
+    } // end with B-TIS interpolation
+
+    /*{
+        EMfields->Ex_->copyFromDeviceToHost();
+        EMfields->Ey_->copyFromDeviceToHost();
+        EMfields->Ez_->copyFromDeviceToHost();
     }
+    double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data();//&( smpi->dynamics_Epart[ithread][0] );
+    double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data();//&( smpi->dynamics_Bpart[ithread][0] );
+    */
+    }
+    // to be deleted
+    {
+    const int nparts = particles.numberOfParticles();
+    double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data();//&( smpi->dynamics_Epart[ithread][0] );
+    double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data();//&( smpi->dynamics_Bpart[ithread][0] );
+    std::cout<< std::setprecision (15)<<"print in interpolator fields wrapper eloc before CopyDeviceToHost"<<std::endl;
+    {
+        smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[0*nparts] ), nparts );
+        smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[1*nparts] ), nparts );
+        smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[2*nparts] ), nparts );
+        smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[0*nparts] ), nparts );
+        smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[1*nparts] ), nparts );
+        smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[2*nparts] ), nparts );
+    }
+    std::cout<<"print in interpolator fields wrapper eloc after CopyDeviceToHost"<<std::endl;
+    }
 }
diff --git a/src/Interpolator/Interpolator1D2Order.h b/src/Interpolator/Interpolator1D2Order.h
--- a/src/Interpolator/Interpolator1D2Order.h
+++ b/src/Interpolator/Interpolator1D2Order.h
@@ ... @@ class Interpolator1D2Order final : public Interpolator1D
     void fieldsSelection( ElectroMagn *EMfields, Particles &particles, double *buffer, int offset, std::vector<unsigned int> *selection ) override final;
     void oneField( Field **field, Particles &particles, int *istart, int *iend, double *FieldLoc, double *l1=NULL, double *l2=NULL, double *l3=NULL ) override final;

-    inline double __attribute__((always_inline)) compute( double *coeff, Field1D *f, int idx )
+    inline double __attribute__((always_inline))
+    compute( double *coeff, Field1D *f, int idx )
     {
         double interp_res = coeff[0] * ( *f )( idx-1 ) + coeff[1] * ( *f )( idx ) + coeff[2] * ( *f )( idx+1 );
         return interp_res;
-    };
+    }
+
+    SMILEI_ACCELERATOR_DECLARE_ROUTINE
+    static inline double __attribute__((always_inline))
+    compute( const double *__restrict__ coeff,
+             const double *__restrict__ f,
+             int idx )
+    {
+        double interp_res = coeff[0] * f[idx-1] + coeff[1] * f[idx] + coeff[2] * f[idx+1];
+        return interp_res;
+    }
+    SMILEI_ACCELERATOR_DECLARE_ROUTINE_END
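The two coeffs() overloads that follow build the same quadratic shape factors, once around the primal node and once around the dual node (shifted by half a cell). A minimal standalone version of the shared math, mirroring the formulas in the patch:

    #include <cmath>

    // Quadratic (2nd-order) shape coefficients around the nearest node:
    // delta is the particle's distance to that node in cell units, and the
    // three weights apply to nodes idx-1, idx, idx+1. For the dual grid,
    // evaluate with xpn + 0.5 instead of xpn.
    inline void shape3( double xpn, int &idx, double coeff[3] )
    {
        idx = static_cast<int>( std::round( xpn ) );
        const double delta  = xpn - static_cast<double>( idx );
        const double delta2 = delta * delta;
        coeff[0] = 0.5 * ( delta2 - delta + 0.25 );
        coeff[1] = 0.75 - delta2;
        coeff[2] = 0.5 * ( delta2 + delta + 0.25 );
    }

The weights sum to 1 for any delta in [-0.5, 0.5], which is what guarantees charge conservation of the gather.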
    void fieldsAndEnvelope( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ipart_ref = 0 ) override final;
     void timeCenteredEnvelope( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ipart_ref = 0 ) override final;
@@ -34,38 +48,82 @@ class Interpolator1D2Order final : public Interpolator1D
     void envelopeFieldForIonization( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ipart_ref = 0 ) override final;

 private:
-    inline void coeffs( double xpn, int* idx_p, int* idx_d,
-                        double *coeffxp, double *coeffxd, double* delta_p )
+    inline void __attribute__((always_inline)) coeffs( double xjn )
+    {
+        double xjmxi2;
+
+        // Dual
+        id_ = std::round( xjn + 0.5 );                  // index of the central point
+        xjmxi  = xjn - static_cast<double>(id_) + 0.5;  // normalized distance to the central node
+        xjmxi2 = xjmxi*xjmxi;                           // square of the normalized distance to the central node
+
+        // 2nd order interpolation on 3 nodes
+        coeffd_[0] = 0.5 * ( xjmxi2-xjmxi + 0.25 );
+        coeffd_[1] = ( 0.75 - xjmxi2 );
+        coeffd_[2] = 0.5 * ( xjmxi2+xjmxi + 0.25 );
+
+        id_ -= i_domain_begin_;
+
+        // Primal
+        ip_ = std::round( xjn );                        // index of the central point
+        xjmxi  = xjn - static_cast<double>(ip_);        // normalized distance to the central node
+        xjmxi2 = xjmxi * xjmxi;                         // square of the normalized distance to the central node
+
+        // 2nd order interpolation on 3 nodes
+        coeffp_[0] = 0.5 * ( xjmxi2 - xjmxi + 0.25 );
+        coeffp_[1] = ( 0.75 - xjmxi2 );
+        coeffp_[2] = 0.5 * ( xjmxi2 + xjmxi + 0.25 );
+
+        ip_ -= i_domain_begin_;
+    }
+
+    // 2nd order interpolation on 3 nodes
+    SMILEI_ACCELERATOR_DECLARE_ROUTINE
+    inline void __attribute__( ( always_inline ) )
+    coeffs( double xpn, int* idx_p, int* idx_d,
+            double *coeffxp, double *coeffxd, double* delta_p ) const
     {
         double delta, delta2;

-        // Primal
-        idx_p[0] = round( xpn );                 // index of the central point
-        delta_p[0] = xpn -( double )idx_p[0];    // normalized distance to the central node
-        delta2 = pow( delta_p[0], 2 );           // square of the normalized distance to the central node
+        // index of the central point
+        idx_p[0] = std::round( xpn );
+        idx_d[0] = std::round( xpn + 0.5 );
+
+        delta  = xpn - static_cast<double>( idx_d[0] ) + 0.5;  // normalized distance to the central node
+        delta2 = delta * delta;                                // square of the normalized distance to the central node

-        // 2nd order interpolation on 3 nodes
-        coeffxp[0] = 0.5 * ( delta2-delta_p[0]+0.25 );
-        coeffxp[1] = ( 0.75-delta2 );
-        coeffxp[2] = 0.5 * ( delta2+delta_p[0]+0.25 );
+        coeffxd[0] = 0.5 * ( delta2 - delta + 0.25 );
+        coeffxd[1] = ( 0.75 - delta2 );
+        coeffxd[2] = 0.5 * ( delta2 + delta + 0.25 );
+
+        delta  = xpn - static_cast<double>( idx_p[0] );
+        delta2 = delta * delta;  // square of the normalized distance to the central node

-        idx_p[0] -= index_domain_begin;
-
-        if(idx_d){
-            // Dual
-            idx_d[0] = round( xpn+0.5 );            // index of the central point
-            delta  = xpn - ( double )idx_d[0] +0.5; // normalized distance to the central node
-            delta2 = delta*delta;                   // square of the normalized distance to the central node
-
-            // 2nd order interpolation on 3 nodes
-            coeffxd[0] = 0.5 * ( delta2-delta+0.25 );
-            coeffxd[1] = ( 0.75-delta2 );
-            coeffxd[2] = 0.5 * ( delta2+delta+0.25 );
-
-            idx_d[0] -= index_domain_begin;
-        }
+        // note: these weights must use the freshly computed delta, not the
+        // not-yet-assigned delta_p[0]
+        coeffxp[0] = 0.5 * ( delta2 - delta + 0.25 );
+        coeffxp[1] = ( 0.75 - delta2 );
+        coeffxp[2] = 0.5 * ( delta2 + delta + 0.25 );
+
+        delta_p[0] = delta;  // normalized distance to the central node
+
+        idx_p[0] = idx_p[0] - i_domain_begin_;
+        idx_d[0] =
idx_d[0] - i_domain_begin_; } + SMILEI_ACCELERATOR_DECLARE_ROUTINE_END + // Last prim index computed + int ip_; + // Last dual index computed + int id_; + // Last delta computed + double xjmxi; + // Interpolation coefficient on Prim grid + double coeffp_[3]; + // Interpolation coefficient on Dual grid + double coeffd_[3]; + };//END class diff --git a/src/Interpolator/Interpolator1D2OrderV.cpp b/src/Interpolator/Interpolator1D2OrderV.cpp old mode 100644 new mode 100755 index 31c3b7d4c..2b99cc66b --- a/src/Interpolator/Interpolator1D2OrderV.cpp +++ b/src/Interpolator/Interpolator1D2OrderV.cpp @@ -176,7 +176,7 @@ void Interpolator1D2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &par coeffd[1] = ( 0.75-xjmxi2 ); coeffd[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - idx -= index_domain_begin; + idx -= i_domain_begin_; // Primal ipx = round( xjn ); // index of the central point @@ -188,7 +188,7 @@ void Interpolator1D2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &par coeffp[1] = ( 0.75-xjmxi2 ); coeffp[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - ipx -= index_domain_begin; + ipx -= i_domain_begin_; // // Interpolate the fields from the Dual grid : Ex, By, Bz Epart_x[ipart] = coeffd[0] * Ex[idx-1] + coeffd[1] * Ex[idx] + coeffd[2] * Ex[idx+1]; @@ -329,7 +329,7 @@ void Interpolator1D2OrderV::timeCenteredEnvelope( ElectroMagn *EMfields, Particl //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Phiold^(p) @@ -388,7 +388,7 @@ void Interpolator1D2OrderV::envelopeAndSusceptibility( ElectroMagn *EMfields, Pa //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) @@ -441,7 +441,7 @@ void Interpolator1D2OrderV::envelopeFieldForIonization( ElectroMagn *EMfields, P //!\todo CHECK if this is correct for both primal & dual grids !!! 
// First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // --------------------------------- // Interpolation of Env_E_abs^(p) diff --git a/src/Interpolator/Interpolator1D2OrderV.h b/src/Interpolator/Interpolator1D2OrderV.h old mode 100644 new mode 100755 index b7dce6588..7c72f9ca2 --- a/src/Interpolator/Interpolator1D2OrderV.h +++ b/src/Interpolator/Interpolator1D2OrderV.h @@ -48,7 +48,7 @@ class Interpolator1D2OrderV final : public Interpolator1D coeffd_[1] = ( 0.75-xjmxi2 ); coeffd_[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = round( xjn ); // index of the central point @@ -60,7 +60,7 @@ class Interpolator1D2OrderV final : public Interpolator1D coeffp_[1] = ( 0.75-xjmxi2 ); coeffp_[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } // Last prim index computed diff --git a/src/Interpolator/Interpolator1D3Order.h b/src/Interpolator/Interpolator1D3Order.h index e9c821925..3228ed39b 100755 --- a/src/Interpolator/Interpolator1D3Order.h +++ b/src/Interpolator/Interpolator1D3Order.h @@ -42,7 +42,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffd_[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffd_[3] = xi3*dble_1ov6; - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = ( int )xjn; // index of the 2nd node @@ -56,7 +56,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffp_[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffp_[3] = xi3*dble_1ov6; - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } inline void coeffs( double xpn, int* idx_p, int* idx_d, @@ -77,7 +77,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffxd[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffxd[3] = xi3*dble_1ov6; - idx_d[0] -= index_domain_begin; + idx_d[0] -= i_domain_begin_; // Primal idx_p[0] = ( int )xpn; // index of the 2nd node @@ -92,7 +92,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffxp[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffxp[3] = xi3*dble_1ov6; - idx_p[0] -= index_domain_begin; + idx_p[0] -= i_domain_begin_; } // Last prim index computed diff --git a/src/Interpolator/Interpolator1D4Order.h b/src/Interpolator/Interpolator1D4Order.h index f8bd48ee4..7bca2b949 100755 --- a/src/Interpolator/Interpolator1D4Order.h +++ b/src/Interpolator/Interpolator1D4Order.h @@ -33,12 +33,64 @@ class Interpolator1D4Order final : public Interpolator1D void envelopeAndSusceptibility( ElectroMagn *EMfields, Particles &particles, int ipart, double *Env_A_abs_Loc, double *Env_Chi_Loc, double *Env_E_abs_Loc, double *Env_Ex_abs_Loc ) override final; private: + inline void __attribute__((always_inline)) coeffs( double xjn ) + { + double xjmxi2, xjmxi3, xjmxi4; + + // Dual + id_ = round( xjn+0.5 ); // index of the central point + xjmxi = xjn -( double )id_+0.5; // normalized distance to the central node + xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the central node + xjmxi3 = xjmxi2*xjmxi; // cube of the normalized distance to the central node + xjmxi4 = xjmxi3*xjmxi; // 4th power of the normalized distance to the central node + + // coefficients for the 4th order interpolation on 5 nodes + coeffd_[0] = dble_1_ov_384 - dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 - dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; + coeffd_[1] = dble_19_ov_96 - dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 + dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; + coeffd_[2] = dble_115_ov_192 - dble_5_ov_8 * xjmxi2 + dble_1_ov_4 * xjmxi4; + 
coeffd_[3] = dble_19_ov_96 + dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 - dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; + coeffd_[4] = dble_1_ov_384 + dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 + dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; + + id_ -= i_domain_begin_; + + // Primal + ip_ = round( xjn ); // index of the central point + xjmxi = xjn -( double )ip_; // normalized distance to the central node + xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the central node + xjmxi3 = xjmxi2*xjmxi; // cube of the normalized distance to the central node + xjmxi4 = xjmxi3*xjmxi; // 4th power of the normalized distance to the central node + + // coefficients for the 4th order interpolation on 5 nodes + coeffp_[0] = dble_1_ov_384 - dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 - dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; + coeffp_[1] = dble_19_ov_96 - dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 + dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; + coeffp_[2] = dble_115_ov_192 - dble_5_ov_8 * xjmxi2 + dble_1_ov_4 * xjmxi4; + coeffp_[3] = dble_19_ov_96 + dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 - dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; + coeffp_[4] = dble_1_ov_384 + dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 + dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; + + ip_ -= i_domain_begin_; + } + inline void coeffs( double xpn, int* idx_p, int* idx_d, double *coeffxp, double *coeffxd, double* delta_p ) { double delta, delta2, delta3, delta4 ; - - + + // Dual + idx_d[0] = round( xpn+0.5 ); // index of the central point + delta = xpn -( double )idx_d[0]+0.5; // normalized distance to the central node + delta2 = delta*delta; // square of the normalized distance to the central node + delta3 = delta2*delta; // cube of the normalized distance to the central node + delta4 = delta3*delta; // 4th power of the normalized distance to the central node + + // coefficients for the 4th order interpolation on 5 nodes + coeffxd[0] = dble_1_ov_384 - dble_1_ov_48 * delta + dble_1_ov_16 * delta2 - dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; + coeffxd[1] = dble_19_ov_96 - dble_11_ov_24 * delta + dble_1_ov_4 * delta2 + dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; + coeffxd[2] = dble_115_ov_192 - dble_5_ov_8 * delta2 + dble_1_ov_4 * delta4; + coeffxd[3] = dble_19_ov_96 + dble_11_ov_24 * delta + dble_1_ov_4 * delta2 - dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; + coeffxd[4] = dble_1_ov_384 + dble_1_ov_48 * delta + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; + + idx_d[0] -= i_domain_begin_; + // Primal idx_p[0] = round( xpn ); // index of the central point delta_p[0] = xpn -( double )idx_p[0]; // normalized distance to the central node @@ -53,25 +105,7 @@ class Interpolator1D4Order final : public Interpolator1D coeffxp[3] = dble_19_ov_96 + dble_11_ov_24 * delta_p[0] + dble_1_ov_4 * delta2 - dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; coeffxp[4] = dble_1_ov_384 + dble_1_ov_48 * delta_p[0] + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; - idx_p[0] -= index_domain_begin; - - if(idx_d){ - // Dual - idx_d[0] = round( xpn+0.5 ); // index of the central point - delta = xpn -( double )idx_d[0]+0.5; // normalized distance to the central node - delta2 = delta*delta; // square of the normalized distance to the central node - delta3 = delta2*delta; // cube of the normalized distance to the central node - delta4 = delta3*delta; // 4th power of the normalized distance to the central node - - // coefficients for the 4th order interpolation on 5 nodes - 
coeffxd[0] = dble_1_ov_384 - dble_1_ov_48 * delta + dble_1_ov_16 * delta2 - dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; - coeffxd[1] = dble_19_ov_96 - dble_11_ov_24 * delta + dble_1_ov_4 * delta2 + dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; - coeffxd[2] = dble_115_ov_192 - dble_5_ov_8 * delta2 + dble_1_ov_4 * delta4; - coeffxd[3] = dble_19_ov_96 + dble_11_ov_24 * delta + dble_1_ov_4 * delta2 - dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; - coeffxd[4] = dble_1_ov_384 + dble_1_ov_48 * delta + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; - - idx_d[0] -= index_domain_begin; - } + idx_p[0] -= i_domain_begin_; } double dble_1_ov_384 ; @@ -86,6 +120,18 @@ class Interpolator1D4Order final : public Interpolator1D double dble_115_ov_192 ; double dble_5_ov_8 ; + // Last prim index computed + int ip_; + // Last dual index computed + int id_; + // Last delta computed + double xjmxi; + // Interpolation coefficient on Prim grid + double coeffp_[5]; + // Interpolation coefficient on Dual grid + double coeffd_[5]; + + };//END class #endif diff --git a/src/Interpolator/Interpolator1DWT2Order.cpp b/src/Interpolator/Interpolator1DWT2Order.cpp index 2ba3881b5..4bc058096 100755 --- a/src/Interpolator/Interpolator1DWT2Order.cpp +++ b/src/Interpolator/Interpolator1DWT2Order.cpp @@ -239,7 +239,7 @@ void Interpolator1DWT2Order::timeCenteredEnvelope( ElectroMagn *EMfields, Partic //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Phiold^(p) @@ -298,7 +298,7 @@ void Interpolator1DWT2Order::envelopeAndSusceptibility( ElectroMagn *EMfields, P //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) @@ -351,7 +351,7 @@ void Interpolator1DWT2Order::envelopeFieldForIonization( ElectroMagn *EMfields, //!\todo CHECK if this is correct for both primal & dual grids !!! 
// First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // --------------------------------- // Interpolation of Env_E_abs^(p) diff --git a/src/Interpolator/Interpolator1DWT2Order.h b/src/Interpolator/Interpolator1DWT2Order.h index 19ea0ee7d..ff45230cf 100755 --- a/src/Interpolator/Interpolator1DWT2Order.h +++ b/src/Interpolator/Interpolator1DWT2Order.h @@ -47,7 +47,7 @@ class Interpolator1DWT2Order final : public Interpolator1D coeffd_[1] = ( 0.75-var1 ); coeffd_[2] = 0.5 * ( var1+xjmxi+0.25 ); - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = round( xjn ); // index of the central point @@ -65,7 +65,7 @@ class Interpolator1DWT2Order final : public Interpolator1D coeffpt_[1] = 1.0 - 2.0 * var1; coeffpt_[2] = var1 + 0.5 * xjmxi; - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } // Coefficients for WT diff --git a/src/Interpolator/Interpolator1DWT2OrderV.cpp b/src/Interpolator/Interpolator1DWT2OrderV.cpp index c64433035..40dd63589 100755 --- a/src/Interpolator/Interpolator1DWT2OrderV.cpp +++ b/src/Interpolator/Interpolator1DWT2OrderV.cpp @@ -178,7 +178,7 @@ void Interpolator1DWT2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &p coeffd[1] = ( 0.75-var1 ); coeffd[2] = 0.5 * ( var1+xjmxi+0.25 ); - idx -= index_domain_begin; + idx -= i_domain_begin_; // Primal ipx = round( xjn ); // index of the central point @@ -190,7 +190,7 @@ void Interpolator1DWT2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &p coeffpt[1] = 1.0 - 2.0 * var1; coeffpt[2] = var1 + 0.5 * xjmxi; - ipx -= index_domain_begin; + ipx -= i_domain_begin_; // // Interpolate the fields from the Dual grid : Ex, By, Bz Epart_x[ipart] = coeffd[0] * Ex[idx-1] + coeffd[1] * Ex[idx] + coeffd[2] * Ex[idx+1]; @@ -331,7 +331,7 @@ void Interpolator1DWT2OrderV::timeCenteredEnvelope( ElectroMagn *EMfields, Parti //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Phiold^(p) @@ -390,7 +390,7 @@ void Interpolator1DWT2OrderV::envelopeAndSusceptibility( ElectroMagn *EMfields, //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) @@ -443,7 +443,7 @@ void Interpolator1DWT2OrderV::envelopeFieldForIonization( ElectroMagn *EMfields, //!\todo CHECK if this is correct for both primal & dual grids !!! 
    // First index for summation
-    ip_ = ip_ - index_domain_begin;
+    ip_ = ip_ - i_domain_begin_;

     // ---------------------------------
     // Interpolation of Env_E_abs^(p)
diff --git a/src/Interpolator/Interpolator1DWT2OrderV.h b/src/Interpolator/Interpolator1DWT2OrderV.h
index 87a083fa5..4f20849c1 100755
--- a/src/Interpolator/Interpolator1DWT2OrderV.h
+++ b/src/Interpolator/Interpolator1DWT2OrderV.h
@@ -48,7 +48,7 @@ class Interpolator1DWT2OrderV final : public Interpolator1D
         coeffd_[1] = ( 0.75-var1 );
         coeffd_[2] = 0.5 * ( var1+xjmxi+0.25 );

-        id_ -= index_domain_begin;
+        id_ -= i_domain_begin_;

         // Primal
         ip_ = round( xjn );    // index of the central point
@@ -66,7 +66,7 @@ class Interpolator1DWT2OrderV final : public Interpolator1D
         coeffpt_[1] = 1.0 - 2.0 * var1;
         coeffpt_[2] = var1 + 0.5 * xjmxi;

-        ip_ -= index_domain_begin;
+        ip_ -= i_domain_begin_;
     }

     // Coefficients for WT
diff --git a/src/Interpolator/Interpolator1DWT4Order.h b/src/Interpolator/Interpolator1DWT4Order.h
index dd5e78b13..6bc889885 100755
--- a/src/Interpolator/Interpolator1DWT4Order.h
+++ b/src/Interpolator/Interpolator1DWT4Order.h
@@ -55,7 +55,7 @@ class Interpolator1DWT4Order final : public Interpolator1D
         coeffd_[3] = dble_19_ov_96 + var1 + var3 * ( 1.5-xjmxi -var2 );
         coeffd_[4] = dble_1_ov_24 * var5 * var5;

-        id_ -= index_domain_begin;
+        id_ -= i_domain_begin_;

         // Primal
         ip_ = round( xjn );    // index of the central point
@@ -94,7 +94,7 @@ class Interpolator1DWT4Order final : public Interpolator1D

         coeffpt_[4] = var3 + var2 - var1;

-        ip_ -= index_domain_begin;
+        ip_ -= i_domain_begin_;
     }

     double dble_1_ov_6 ;
diff --git a/src/Interpolator/InterpolatorFactory.h b/src/Interpolator/InterpolatorFactory.h
index f2cbd7c19..37e1042fb 100755
--- a/src/Interpolator/InterpolatorFactory.h
+++ b/src/Interpolator/InterpolatorFactory.h
@@ -48,12 +48,22 @@ class InterpolatorFactory
         // 1Dcartesian simulation
         // ---------------
         if( ( params.geometry == "1Dcartesian" ) && ( params.interpolation_order == 2 ) ) {
+            if( !vectorization ) {
+                if ( params.interpolator_ == "momentum-conserving" ) {
+                    Interp = new Interpolator1D2Order( params, patch );
+                }
+                else if ( params.interpolator_ == "wt" ) {
+                    Interp = new Interpolator1DWT2Order( params, patch );
+                }
+            }
+            else {
                 if ( params.interpolator_ == "momentum-conserving" ) {
                     Interp = new Interpolator1D2OrderV( params, patch );
                 }
                 else if ( params.interpolator_ == "wt" ) {
                     Interp = new Interpolator1DWT2OrderV( params, patch );
                 }
+            }
         } else if( ( params.geometry == "1Dcartesian" ) && ( params.interpolation_order == 4 ) ) {
             if( params.interpolator_ == "momentum-conserving" ) {
                 Interp = new Interpolator1D4Order( params, patch );
diff --git a/src/Params/Params.h b/src/Params/Params.h
index e2b0603e6..41896f7e9 100755
--- a/src/Params/Params.h
+++ b/src/Params/Params.h
@@ -407,7 +407,7 @@ class Params
         //#if defined( SMILEI_ACCELERATOR_GPU_OMP )
         switch( dimension_id ) {
             case 1:
-                return -1;
+                return 4; // check for optimal value
             case 2:
                 return 4;
             case 3:
diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu
old mode 100644
new mode 100755
index d7a63f0b3..16941b152
--- a/src/Particles/nvidiaParticles.cu
+++ b/src/Particles/nvidiaParticles.cu
@@ -12,10 +12,9 @@
 #include
 #include
 #include
-#include <thrust/remove.h>
-#include <thrust/sort.h>
-#include <thrust/count.h>
-
+#include <thrust/remove.h> // for thrust::remove_if
+#include <thrust/sort.h>   // for thrust::sort_by_key
+#include <thrust/count.h>  // for thrust::count_if
 #include "Patch.h"
 #include "gpu.h"
@@ -125,11 +124,46 @@ namespace detail {
                       ParticleNoKeyIteratorProvider particle_no_key_iterator_provider );
     };

+    template
+    struct Cluster1D : public Cluster
+    {
+    public:
+        Cluster1D( double inverse_x_cell_dimension,
+                   SizeType local_x_dimension_in_cell,
+                   int CellStartingGlobalIndex_for_x);
+
+        //! Compute the cell key of a_particle. a_particle shall be a tuple (from a
+        //! zipiterator).
+        //! The first value of a_particle is the cell key value, the other values are
+        //! the positions x
+        template <typename Tuple>
+        __host__ __device__ IDType
+        Index( const Tuple& a_particle ) const;
+
+        //! Compute the cell key of a particle range.
+        //!
+        static void
+        computeParticleClusterKey( nvidiaParticles& particle_container,
+                                   const Params&    parameters,
+                                   const Patch&     a_parent_patch );
+
+        static void
+        sortParticleByKey( nvidiaParticles& particle_container,
+                           const Params&    parameters );
+
+        static void
+        importAndSortParticles( nvidiaParticles& particle_container,
+                                nvidiaParticles& particle_to_inject,
+                                const Params&    parameters,
+                                const Patch&     a_parent_patch );
+
+        double inverse_of_x_cell_dimension_;
+        int    CellStartingGlobalIndex_for_x_;
+    };

     template
     struct Cluster2D : public Cluster
     {
-    public:
     public:
         Cluster2D( double inverse_x_cell_dimension,
                    double inverse_y_cell_dimension,
@@ -164,18 +198,16 @@ namespace detail {
                                    const Params& parameters,
                                    const Patch&  a_parent_patch );

-    public:
         double inverse_of_x_cell_dimension_;
         double inverse_of_y_cell_dimension_;
         SizeType local_y_dimension_in_cluster_;
-        int CellStartingGlobalIndex_for_x_;
+        int CellStartingGlobalIndex_for_x_;
         int CellStartingGlobalIndex_for_y_;
     };

     template
     struct Cluster3D : public Cluster
     {
-    public:
     public:
         Cluster3D( double inverse_x_cell_dimension,
                    double inverse_y_cell_dimension,
                    double inverse_z_cell_dimension,
                    SizeType local_x_dimension_in_cell,
                    SizeType local_y_dimension_in_cell,
                    SizeType local_z_dimension_in_cell,
-                   int CellStartingGlobalIndex_for_x,
+                   int CellStartingGlobalIndex_for_x,
                    int CellStartingGlobalIndex_for_y,
                    int CellStartingGlobalIndex_for_z);
@@ -213,14 +245,13 @@ namespace detail {
                                    const Params& parameters,
                                    const Patch&  a_parent_patch );

-    public:
         double inverse_of_x_cell_dimension_;
         double inverse_of_y_cell_dimension_;
         double inverse_of_z_cell_dimension_;
         SizeType local_y_dimension_in_cluster_;
         SizeType local_z_dimension_in_cluster_;
         int CellStartingGlobalIndex_for_x_;
-        int CellStartingGlobalIndex_for_y_;
+        int CellStartingGlobalIndex_for_y_;
         int CellStartingGlobalIndex_for_z_;
     };

     template <class ClusterType>
     class AssignClusterIndex
     {
-    public:
     public:
         AssignClusterIndex( ClusterType cluster_type )
             : cluster_type_{ cluster_type }
         {
-            // EMPTY
         }

         template <typename Tuple>
         __host__ __device__ void
         operator()( Tuple& a_particle ) const
         {
-            thrust::get<0>( a_particle ) /* cluster key */ = cluster_type_.Index( a_particle );
+            thrust::get<0>( a_particle ) = cluster_type_.Index( a_particle ); //cluster key
         }

     protected:
         ClusterType cluster_type_;
     };

     //! This functor assigns a cluster key to a_particle.
-    //!
     template <class ClusterType>
     struct OutOfClusterPredicate
     {
-    public:
     public:
         OutOfClusterPredicate( ClusterType cluster_type )
             : cluster_type_{ cluster_type }
         {
-            // EMPTY
         }

         template <typename Tuple>
         __host__ __device__ bool
         operator()( const Tuple& a_particle ) const
         {
             // NOTE: it's UB (undefined behavior) to set the cluster key of wrongly keyed particles
             // now..
- return thrust::get<0>( a_particle ) /* cluster key */ != cluster_type_.Index( a_particle ); + return thrust::get<0>( a_particle ) != cluster_type_.Index( a_particle );//cluster key } protected: @@ -286,7 +312,7 @@ namespace detail { __host__ __device__ bool operator()( const Tuple& a_particle ) const { - return thrust::get<0>( a_particle ) /* cluster key */ == -1; + return thrust::get<0>( a_particle ) == -1;//cluster key } }; @@ -304,6 +330,12 @@ namespace detail { // dimensions. switch( particle_container.dimension() ) { + case 1: { + Cluster1D::computeParticleClusterKey( particle_container, + parameters, + a_parent_patch ); + break; + } case 2: { Cluster2D::computeParticleClusterKey( particle_container, parameters, @@ -317,7 +349,7 @@ namespace detail { break; } default: - // Not implemented, only Cartesian 2D or 3D for the moment + // Not implemented, only Cartesian 1D, 2D or 3D for the moment SMILEI_ASSERT( false ); break; } @@ -331,6 +363,11 @@ namespace detail { // dimensions. switch( particle_container.dimension() ) { + case 1: { + Cluster1D::sortParticleByKey( particle_container, + parameters ); + break; + } case 2: { Cluster2D::sortParticleByKey( particle_container, parameters ); @@ -342,7 +379,7 @@ namespace detail { break; } default: - // Not implemented, only Cartesian 2D or 3D for the moment + // Not implemented, only Cartesian 1D, 2D or 3D for the moment SMILEI_ASSERT( false ); break; } @@ -392,15 +429,22 @@ namespace detail { // dimensions. switch( particle_container.dimension() ) { + case 1: { + Cluster1D::importAndSortParticles( particle_container, + particle_to_inject, + parameters, + a_parent_patch ); + break; + } case 2: { - Cluster2D::importAndSortParticles( particle_container, + Cluster2D::importAndSortParticles( particle_container, particle_to_inject, parameters, a_parent_patch ); break; } case 3: { - Cluster3D::importAndSortParticles( particle_container, + Cluster3D::importAndSortParticles( particle_container, particle_to_inject, parameters, a_parent_patch ); @@ -408,7 +452,7 @@ namespace detail { } default: - // Not implemented, only 2D for the moment + // Not implemented, only Cartesian 1D, 2D or 3D for the moment SMILEI_ASSERT( false ); break; } @@ -546,9 +590,18 @@ namespace detail { //////////////////////////////////////////////////////////////////////////////// - // Cluster2D method definitions + // Cluster method definitions //////////////////////////////////////////////////////////////////////////////// + template + Cluster1D::Cluster1D( double inverse_x_cell_dimension, + SizeType local_x_dimension_in_cell, + int CellStartingGlobalIndex_for_x) + : inverse_of_x_cell_dimension_{ inverse_x_cell_dimension } + , CellStartingGlobalIndex_for_x_{CellStartingGlobalIndex_for_x} + { + } + template Cluster2D::Cluster2D( double inverse_x_cell_dimension, double inverse_y_cell_dimension, @@ -561,7 +614,6 @@ namespace detail { , CellStartingGlobalIndex_for_x_{CellStartingGlobalIndex_for_x} , CellStartingGlobalIndex_for_y_{CellStartingGlobalIndex_for_y} { - // EMPTY } template @@ -571,7 +623,7 @@ namespace detail { SizeType local_x_dimension_in_cell, SizeType local_y_dimension_in_cell, SizeType local_z_dimension_in_cell, - int CellStartingGlobalIndex_for_x, + int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y, int CellStartingGlobalIndex_for_z ) : inverse_of_x_cell_dimension_{ inverse_x_cell_dimension } , inverse_of_y_cell_dimension_{ inverse_y_cell_dimension } @@ -582,7 +634,30 @@ namespace detail { , 
CellStartingGlobalIndex_for_y_{CellStartingGlobalIndex_for_y}
         , CellStartingGlobalIndex_for_z_{CellStartingGlobalIndex_for_z}
     {
-        // EMPTY
+    }
+
+    template
+    template <typename Tuple>
+    __host__ __device__ typename Cluster1D::IDType
+    Cluster1D::Index( const Tuple& a_particle ) const
+    {
+        const SizeType local_x_particle_coordinate_in_cell = static_cast<SizeType>( thrust::get<1>( a_particle ) *
+                                                                                    inverse_of_x_cell_dimension_ ) -
+                                                             CellStartingGlobalIndex_for_x_;
+
+        // These divisions will be optimized.
+        // The integer division rounding behavior is expected.
+
+        // NOTE: Flat tiles have been studied but were not as efficient for the
+        // projection. The square provides the minimal perimeter (and thus ghost
+        // cell amount) for a given area.
+        static constexpr SizeType x_cluster_dimension_in_cell = kClusterWidth;
+
+        const SizeType local_x_particle_cluster_coordinate_in_cluster = local_x_particle_coordinate_in_cell / x_cluster_dimension_in_cell;
+
+        const SizeType cluster_index = local_x_particle_cluster_coordinate_in_cluster;
+
+        return static_cast<IDType>( cluster_index );
     }

@@ -658,6 +733,23 @@ namespace detail {
         return static_cast<IDType>( cluster_index );
     }

+    template
+    void
+    Cluster1D::computeParticleClusterKey( nvidiaParticles& particle_container,
+                                          const Params&    parameters,
+                                          const Patch&     a_parent_patch )
+    {
+        const auto first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(),
+                                                                          static_cast<const double *>( particle_container.getPtrPosition( 0 ) ) ) );
+        const auto last  = first + particle_container.deviceSize();
+        int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0);
+        printf ( "CellStartingGlobalIndex_for_x %d res %f patch size %d \n",CellStartingGlobalIndex_for_x,parameters.res_space[0], parameters.patch_size_[0] );
+        doComputeParticleClusterKey( first, last,
+                                     Cluster1D{ parameters.res_space[0],
+                                                parameters.patch_size_[0],
+                                                CellStartingGlobalIndex_for_x} );
+    }
+
     template
     void
     Cluster2D::computeParticleClusterKey( nvidiaParticles& particle_container,
@@ -670,7 +762,7 @@ namespace detail {
         const auto last = first + particle_container.deviceSize();
         int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0);
         int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1);
-        doComputeParticleClusterKey( first, last,
+        doComputeParticleClusterKey( first, last,
                                      Cluster2D{ parameters.res_space[0],
                                                 parameters.res_space[1],
                                                 parameters.patch_size_[0],
                                                 parameters.patch_size_[1],
                                                 CellStartingGlobalIndex_for_x,
                                                 CellStartingGlobalIndex_for_y } );
     }
@@ -693,7 +785,7 @@ namespace detail {
         int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0);
         int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1);
         int CellStartingGlobalIndex_for_z = a_parent_patch.getCellStartingGlobalIndex_noGC(2);
-        doComputeParticleClusterKey( first, last,
+        doComputeParticleClusterKey( first, last,
                                      Cluster3D{ parameters.res_space[0],
                                                 parameters.res_space[1],
                                                 parameters.res_space[2],
                                                 parameters.patch_size_[0],
                                                 parameters.patch_size_[1],
                                                 parameters.patch_size_[2],
                                                 CellStartingGlobalIndex_for_x,
                                                 CellStartingGlobalIndex_for_y,
                                                 CellStartingGlobalIndex_for_z } );
     }
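Cluster1D::Index() above reduces, in 1D, to a cell index followed by an integer division by the cluster width. A trimmed, hypothetical standalone form of the same mapping (names are illustrative):

    // 1D cluster-key sketch: map a particle position to the index of the
    // cluster (tile of kClusterWidth cells) it belongs to, relative to the
    // patch's first cell.
    template <int kClusterWidth>
    __host__ __device__ int cluster_key_1d( double position_x,
                                            double inverse_cell_dx,
                                            int cell_start_x )
    {
        const int cell = static_cast<int>( position_x * inverse_cell_dx ) - cell_start_x;
        return cell / kClusterWidth;  // integer division: cells -> cluster index
    }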
thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ),
+                                                                                             particle_container.getPtrMomentum( 0 ),
+                                                                                             particle_container.getPtrMomentum( 1 ),
+                                                                                             particle_container.getPtrMomentum( 2 ),
+                                                                                             particle_container.getPtrWeight(),
+                                                                                             particle_container.getPtrCharge(),
+                                                                                             particle_container.getPtrId() ) );
+                    doSortParticleByKey( particle_container.getPtrCellKeys(),
+                                         particle_container.getPtrCellKeys() + particle_container.deviceSize(),
+                                         value_first );
+                }
+                else {
+                    const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ),
+                                                                                             particle_container.getPtrMomentum( 0 ),
+                                                                                             particle_container.getPtrMomentum( 1 ),
+                                                                                             particle_container.getPtrMomentum( 2 ),
+                                                                                             particle_container.getPtrWeight(),
+                                                                                             particle_container.getPtrCharge() ) );
+                    doSortParticleByKey( particle_container.getPtrCellKeys(),
+                                         particle_container.getPtrCellKeys() + particle_container.deviceSize(),
+                                         value_first );
+                }
+            }
+        }
+    }
 
     template 
     void
     Cluster2D::sortParticleByKey( nvidiaParticles& particle_container,
                                   const Params& )
@@ -732,17 +869,31 @@ namespace detail {
             // The appropriate thrust::zip_iterator for the current
             // simulation's parameters
 
-            const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ),
-                                                                                     particle_container.getPtrPosition( 1 ),
-                                                                                     particle_container.getPtrMomentum( 0 ),
-                                                                                     particle_container.getPtrMomentum( 1 ),
-                                                                                     particle_container.getPtrMomentum( 2 ),
-                                                                                     particle_container.getPtrWeight(),
-                                                                                     particle_container.getPtrCharge() ) );
-
-            doSortParticleByKey( particle_container.getPtrCellKeys(),
-                                 particle_container.getPtrCellKeys() + particle_container.deviceSize(),
-                                 value_first );
+            if (particle_container.tracked) {
+                const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ),
+                                                                                         particle_container.getPtrPosition( 1 ),
+                                                                                         particle_container.getPtrMomentum( 0 ),
+                                                                                         particle_container.getPtrMomentum( 1 ),
+                                                                                         particle_container.getPtrMomentum( 2 ),
+                                                                                         particle_container.getPtrWeight(),
+                                                                                         particle_container.getPtrCharge(),
+                                                                                         particle_container.getPtrId() ) );
+                doSortParticleByKey( particle_container.getPtrCellKeys(),
+                                     particle_container.getPtrCellKeys() + particle_container.deviceSize(),
+                                     value_first );
+            }
+            else {
+                const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ),
+                                                                                         particle_container.getPtrPosition( 1 ),
+                                                                                         particle_container.getPtrMomentum( 0 ),
+                                                                                         particle_container.getPtrMomentum( 1 ),
+                                                                                         particle_container.getPtrMomentum( 2 ),
+                                                                                         particle_container.getPtrWeight(),
+                                                                                         particle_container.getPtrCharge() ) );
+                doSortParticleByKey( particle_container.getPtrCellKeys(),
+                                     particle_container.getPtrCellKeys() + particle_container.deviceSize(),
+                                     value_first );
+            }
         }
     }
 
@@ -805,6 +955,67 @@ namespace detail {
             }
         }
     }
+    template 
+    void
+    Cluster1D::importAndSortParticles( nvidiaParticles& particle_container,
+                                       nvidiaParticles& particle_to_inject,
+                                       const Params&    parameters,
+                                       const Patch&     a_parent_patch )
+    {
+        // This is where we do a runtime dispatch depending on the simulation's
+        // qed/radiation settings.
+
+        // NOTE: For now we don't support qed/radiations. Performance
+        // comes from specialization.
+
+        // TODO(Etienne M): Find a better way to dispatch at runtime. This is
+        // complex to read and to maintain. 
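+        // Illustrative sketch of what this TODO is about (not part of the
+        // build): each qed / Monte-Carlo combination only differs by the zip
+        // iterator it builds, so the nested ifs below could in principle be
+        // replaced by a provider chosen once, e.g.:
+        //
+        //   // 'selectIteratorProvider' is hypothetical, not an existing API
+        //   const auto provider = selectIteratorProvider( particle_container.has_quantum_parameter,
+        //                                                 particle_container.has_Monte_Carlo_process );
+        //
+        // Only the no-qed / no-Monte-Carlo path is implemented below.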
+ int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); + printf("CellStartingGlobalIndex_for_x %d \n" , CellStartingGlobalIndex_for_x ); + + const Cluster1D cluster_manipulator{ parameters.res_space[0], + parameters.patch_size_[0], + CellStartingGlobalIndex_for_x}; + + if( particle_container.has_quantum_parameter ) { + if( particle_container.has_Monte_Carlo_process ) { + SMILEI_ASSERT( false ); + } else { + SMILEI_ASSERT( false ); + } + } else { + if( particle_container.has_Monte_Carlo_process ) { + SMILEI_ASSERT( false ); + } else { + // Returns the appropriate thrust::zip_iterator for the + // current simulation's parameters + const auto particle_iterator_provider = []( nvidiaParticles& particle_container ) { + return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), + particle_container.getPtrPosition( 0 ), + particle_container.getPtrMomentum( 0 ), + particle_container.getPtrMomentum( 1 ), + particle_container.getPtrMomentum( 2 ), + particle_container.getPtrWeight(), + particle_container.getPtrCharge() ) ); + }; + + const auto particle_no_key_iterator_provider = []( nvidiaParticles& particle_container ) { + return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), + particle_container.getPtrMomentum( 0 ), + particle_container.getPtrMomentum( 1 ), + particle_container.getPtrMomentum( 2 ), + particle_container.getPtrWeight(), + particle_container.getPtrCharge() ) ); + }; + + doImportAndSortParticles( particle_container, + particle_to_inject, + cluster_manipulator, + particle_iterator_provider, + particle_no_key_iterator_provider ); + } + } + } template void @@ -824,6 +1035,7 @@ namespace detail { int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); + printf("CellStartingGlobalIndex_for_x %d \n" , CellStartingGlobalIndex_for_x ); const Cluster2D cluster_manipulator{ parameters.res_space[0], parameters.res_space[1], parameters.patch_size_[0], @@ -990,7 +1202,6 @@ nvidiaParticles::nvidiaParticles( const Params& parameters, , parent_patch_{ &a_parent_patch } , gpu_nparts_{} { - // EMPTY } nvidiaParticles::~nvidiaParticles() { @@ -1271,10 +1482,9 @@ void nvidiaParticles::initializeDataOnDevice() // setHostBinIndex(); } else { - + printf( " parent patch %p cells starting global index %d \n", parent_patch_, parent_patch_->getCellStartingGlobalIndex_noGC(0) ); // At this point, a copy of the host particles and last_index is on the // device and we know we support the space dimension. - detail::Cluster::computeParticleClusterKey( *this, *parameters_, *parent_patch_ ); // The particles are not correctly sorted when created. 
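+        // Hedged sketch (illustration only, not part of the build): the
+        // cluster-key pass above and the sort that follows it boil down to a
+        // key computation plus a key/value sort. With thrust this is roughly:
+        //
+        //   // keys: one cluster key per particle; values: zipped property arrays
+        //   thrust::transform( thrust::device, prop_first, prop_last,
+        //                      getPtrCellKeys(), cluster_index_functor );
+        //   thrust::sort_by_key( thrust::device, getPtrCellKeys(),
+        //                        getPtrCellKeys() + deviceSize(), zipped_properties );
+        //
+        // 'cluster_index_functor' and 'zipped_properties' stand for the
+        // ClusterXD::Index functor and the zip iterators built in detail::.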
@@ -1380,13 +1590,13 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move )
     const int nparts = gpu_nparts_;
     const int position_dimension_count = nvidia_position_.size();
 
-    const int nparts_to_move = thrust::count_if( thrust::device,
+    const int number_of_particles_to_move = thrust::count_if( thrust::device,
                                                  nvidia_cell_keys_.cbegin(),
                                                  nvidia_cell_keys_.cbegin() + nparts,
                                                  count_if_out() );
 
     // Resize it, if too small (copy_if do not resize)
-    cp_parts->resize( nparts_to_move );
+    cp_parts->resize( number_of_particles_to_move );
 
     // Iterator of the main data structure
     // NOTE: https://nvidia.github.io/thrust/api/classes/classthrust_1_1zip__iterator.html#class-thrustzip_iterator
@@ -1404,7 +1614,7 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move )
                                                                        cp_parts->nvidia_weight_.begin(),
                                                                        cp_parts->nvidia_charge_.begin() ) );
 
-    // Copy send particles in dedicated data structure if nvidia_cell_keys_=0 (currently = 1 if keeped, new PartBoundCond::apply(...))
+    // Copy send particles in dedicated data structure if nvidia_cell_keys_=0 (currently = 1 if kept, new PartBoundCond::apply(...))
     thrust::copy_if( thrust::device,
                      source_iterator_first,
                      source_iterator_last,
@@ -1653,7 +1863,6 @@ void nvidiaParticles::createParticles( int n_additional_particles )
         nvidia_id_.resize( new_size );
         thrust::fill( nvidia_id_.begin() + n_particles, nvidia_id_.begin() + new_size, 0 );
     }
-
     nvidia_cell_keys_.resize( new_size );
     thrust::fill( nvidia_cell_keys_.begin() + n_particles, nvidia_cell_keys_.begin() + new_size, -1 );
 
@@ -1747,6 +1956,9 @@ extern "C"
 {
     void* CreateGPUParticles( const void* parameters, const void* a_parent_patch )
     {
+        const Patch *temp = static_cast( a_parent_patch );
+
+        printf( " in create GPU parent patch %p cells starting global index %d \n", a_parent_patch, temp->getCellStartingGlobalIndex_noGC(0) );
         return new nvidiaParticles{ *static_cast( parameters ),
                                     *static_cast( a_parent_patch ) };
     }
diff --git a/src/Patch/SyncVectorPatch.cpp b/src/Patch/SyncVectorPatch.cpp
index 09817b201..2982162de 100755
--- a/src/Patch/SyncVectorPatch.cpp
+++ b/src/Patch/SyncVectorPatch.cpp
@@ -48,7 +48,7 @@ void SyncVectorPatch::exchangeParticles( VectorPatch &vecPatches, int ispec, Par
 
 // ---------------------------------------------------------------------------------------------------------------------
 //! This function performs:
-//! - the exhcange of particles for each direction using the diagonal trick.
+//! - the exchange of particles for each direction using the diagonal trick.
 //! - the importation of the new particles in the particle property arrays
 //! - the sorting of particles
 // ---------------------------------------------------------------------------------------------------------------------
diff --git a/src/Projector/Projector1D.h b/src/Projector/Projector1D.h
index d51327bb7..c08c0e9a8 100755
--- a/src/Projector/Projector1D.h
+++ b/src/Projector/Projector1D.h
@@ -18,21 +18,19 @@ class Projector1D : public Projector
     virtual ~Projector1D() {};
 
     virtual void mv_win( unsigned int shift )
     {
-        index_domain_begin+=shift;
+        i_domain_begin_ += shift;
     }
     virtual void setMvWinLimits( unsigned int shift )
     {
-        index_domain_begin = shift;
+        i_domain_begin_ = shift;
     }
 
 protected:
     //! 
Inverse of the spatial step 1/dx double dx_inv_; - int index_domain_begin; + double dx_ov_dt_; + int i_domain_begin_; double *Jx_, *Jy_, *Jz_, *rho_; - -private: - }; #endif diff --git a/src/Projector/Projector1D2Order.cpp b/src/Projector/Projector1D2Order.cpp index cd587dc71..451bca539 100755 --- a/src/Projector/Projector1D2Order.cpp +++ b/src/Projector/Projector1D2Order.cpp @@ -18,14 +18,12 @@ using namespace std; Projector1D2Order::Projector1D2Order( Params ¶ms, Patch *patch ) : Projector1D( params, patch ) { dx_inv_ = 1.0/params.cell_length[0]; - dx_ov_dt = params.cell_length[0] / params.timestep; + dx_ov_dt_ = params.cell_length[0] / params.timestep; - index_domain_begin = patch->getCellStartingGlobalIndex( 0 ); - - dt = params.timestep; - dts2 = params.timestep/2.; - dts4 = params.timestep/4.; + i_domain_begin_ = patch->getCellStartingGlobalIndex( 0 ); + dts2_ = params.timestep/2.; + dts4_ = params.timestep/4.; } @@ -43,7 +41,7 @@ void Projector1D2Order::currents( double *Jx, double *Jy, double *Jz, Particles int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xip, xj_m_xip2; - double crx_p = charge_weight*dx_ov_dt; // current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[5], S1[5], Wl[5], Wt[5], Jx_p[5]; // arrays used for the Esirkepov projection method @@ -76,7 +74,7 @@ void Projector1D2Order::currents( double *Jx, double *Jy, double *Jz, Particles // coefficients 2nd order interpolation on 3 nodes ipo = *iold; // index of the central node - ip_m_ipo = ip-ipo-index_domain_begin; + ip_m_ipo = ip-ipo-i_domain_begin_; S1[ip_m_ipo+1] = 0.5 * ( xj_m_xip2-xj_m_xip+0.25 ); S1[ip_m_ipo+2] = ( 0.75-xj_m_xip2 ); S1[ip_m_ipo+3] = 0.5 * ( xj_m_xip2+xj_m_xip+0.25 ); @@ -115,7 +113,7 @@ void Projector1D2Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xip, xj_m_xip2; - double crx_p = charge_weight*dx_ov_dt; // current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[5], S1[5], Wl[5], Wt[5], Jx_p[5]; // arrays used for the Esirkepov projection method @@ -132,7 +130,7 @@ void Projector1D2Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, // Locate particle old position on the primal grid xj_m_xipo = *deltaold; // normalized distance to the nearest grid point - xj_m_xipo2 = xj_m_xipo*xj_m_xipo; // square of the normalized distance to the nearest grid point + xj_m_xipo2 = xj_m_xipo*xj_m_xipo; // square of the normalized distance to the nearest grid point // Locate particle new position on the primal grid xjn = particles.position( 0, ipart ) * dx_inv_; @@ -142,16 +140,16 @@ void 
Projector1D2Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, // coefficients 2nd order interpolation on 3 nodes - S0[1] = 0.5 * ( xj_m_xipo2-xj_m_xipo+0.25 ); - S0[2] = ( 0.75-xj_m_xipo2 ); - S0[3] = 0.5 * ( xj_m_xipo2+xj_m_xipo+0.25 ); + S0[1] = 0.5 * ( xj_m_xipo2 - xj_m_xipo + 0.25 ); + S0[2] = ( 0.75 - xj_m_xipo2 ); + S0[3] = 0.5 * ( xj_m_xipo2 + xj_m_xipo + 0.25 ); // coefficients 2nd order interpolation on 3 nodes ipo = *iold; - ip_m_ipo = ip-ipo-index_domain_begin; - S1[ip_m_ipo+1] = 0.5 * ( xj_m_xip2-xj_m_xip+0.25 ); - S1[ip_m_ipo+2] = ( 0.75-xj_m_xip2 ); - S1[ip_m_ipo+3] = 0.5 * ( xj_m_xip2+xj_m_xip+0.25 ); + ip_m_ipo = ip-ipo-i_domain_begin_; + S1[ip_m_ipo+1] = 0.5 * ( xj_m_xip2 - xj_m_xip + 0.25 ); + S1[ip_m_ipo+2] = ( 0.75 - xj_m_xip2 ); + S1[ip_m_ipo+3] = 0.5 * ( xj_m_xip2 + xj_m_xip + 0.25 ); // coefficients used in the Esirkepov method for( unsigned int i=0; i<5; i++ ) { @@ -228,7 +226,7 @@ void Projector1D2Order::basic( double *rhoj, Particles &particles, unsigned int S1[2] = ( 0.75-xj_m_xip2 ); S1[3] = 0.5 * ( xj_m_xip2+xj_m_xip+0.25 ); - ip -= index_domain_begin + 2 + bin_shift; + ip -= i_domain_begin_ + 2 + bin_shift; // 2nd order projection for charge density // At the 2nd order, oversize = 2. @@ -270,7 +268,7 @@ void Projector1D2Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi = xjn - ( double )i + 0.5; // normalized distance to the nearest grid point xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the nearest grid point - i -= index_domain_begin; + i -= i_domain_begin_; im1 = i-1; ip1 = i+1; @@ -291,7 +289,7 @@ void Projector1D2Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi = xjn - ( double )i; // normalized distance to the nearest grid point xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the nearest grid point - i -= index_domain_begin; + i -= i_domain_begin_; im1 = i-1; ip1 = i+1; @@ -377,9 +375,9 @@ void Projector1D2Order::susceptibility( ElectroMagn *EMfields, Particles &partic for( int ipart=istart ; ipartJx_/Jy_/Jz_) inline void currents( double *Jx, double *Jy, double *Jz, Particles &particles, unsigned int ipart, double invgf, int *iold, double *deltaold, int bin_shift = 0 ); //! Project global current densities (EMfields->Jx_/Jy_/Jz_/rho), diagFields timestep - inline void currentsAndDensity( double *Jx, double *Jy, double *Jz, double *rho, Particles &particles, unsigned int ipart, double invgf, int *iold, double *deltaold, int bin_shift = 0 ); + inline void __attribute__((always_inline)) currentsAndDensity( double *Jx, double *Jy, double *Jz, double *rho, Particles &particles, unsigned int ipart, double invgf, int *iold, double *deltaold, int bin_shift = 0 ); //! 
Project global current charge (EMfields->rho_ , J), for initialization and diags void basic( double *rhoj, Particles &particles, unsigned int ipart, unsigned int type, int bin_shift = 0 ) override final; @@ -36,8 +36,7 @@ class Projector1D2Order : public Projector1D void susceptibilityOnBuffer( ElectroMagn *EMfields, double *b_Chi, int bin_shift, int bdim0, Particles &particles, double species_mass, SmileiMPI *smpi, int istart, int iend, int ithread, int icell = 0, int ipart_ref = 0 ) override final; private: - double dx_ov_dt; - double dt, dts2, dts4; + double dts2_, dts4_; }; #endif diff --git a/src/Projector/Projector1D2OrderGPU.cpp b/src/Projector/Projector1D2OrderGPU.cpp new file mode 100755 index 000000000..79d879024 --- /dev/null +++ b/src/Projector/Projector1D2OrderGPU.cpp @@ -0,0 +1,385 @@ + + +#if defined( SMILEI_ACCELERATOR_MODE ) +#include "Projector1D2OrderGPUKernelCUDAHIP.h" +#include +#include "Tools.h" +#endif + +#include "Projector1D2OrderGPU.h" + +#include "ElectroMagn.h" +#include "Patch.h" +#include "gpu.h" + + +Projector1D2OrderGPU::Projector1D2OrderGPU( Params ¶meters, Patch *a_patch ) + : Projector1D{ parameters, a_patch } +{ + Projector1D::dx_inv_ = 1.0 / parameters.cell_length[0]; + Projector1D::dx_ov_dt_ = parameters.cell_length[0] / parameters.timestep; + Projector1D::i_domain_begin_ = a_patch->getCellStartingGlobalIndex( 0 ); + + not_spectral_ = !parameters.is_pxr; + dts2_ = parameters.timestep / 2.0; + dts4_ = dts2_ / 2.0; +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_OPENACC_MODE ) + x_dimension_bin_count_ = parameters.getGPUBinCount( 1 ); +#else + ERROR( "Only usable in GPU mode! " ); +#endif +} + +Projector1D2OrderGPU::~Projector1D2OrderGPU() +{ +} +#if defined( SMILEI_ACCELERATOR_MODE ) + + +//! Project global current densities (EMfields->Jx_/Jy_/Jz_) +extern "C" void +currentDepositionKernel1DOnDevice( double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count_, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv_, + double dx_ov_dt_, + int i_domain_begin_, + int not_spectral_ ) +{ + cudahip1d::currentDepositionKernel1D( host_Jx, host_Jy, host_Jz, + Jx_size, Jy_size, Jz_size, + device_particle_position_x, device_particle_momentum_y, + device_particle_momentum_z, + device_particle_charge, + device_particle_weight, + host_bin_index, + x_dimension_bin_count_, + host_invgf_, + host_iold_, host_deltaold_, + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ ); +} + + +//! Project global current and charge densities (EMfields->Jx_/Jy_/Jz_/rho_) +//! 
+extern "C" void
+currentAndDensityDepositionKernel1DOnDevice( double *__restrict__ host_Jx,
+                                             double *__restrict__ host_Jy,
+                                             double *__restrict__ host_Jz,
+                                             double *__restrict__ host_rho,
+                                             int Jx_size,
+                                             int Jy_size,
+                                             int Jz_size,
+                                             int rho_size,
+                                             const double *__restrict__ device_particle_position_x,
+                                             const double *__restrict__ device_particle_momentum_y,
+                                             const double *__restrict__ device_particle_momentum_z,
+                                             const short *__restrict__ device_particle_charge,
+                                             const double *__restrict__ device_particle_weight,
+                                             const int *__restrict__ host_bin_index,
+                                             unsigned int x_dimension_bin_count_,
+                                             const double *__restrict__ host_invgf_,
+                                             const int *__restrict__ host_iold_,
+                                             const double *__restrict__ host_deltaold_,
+                                             double inv_cell_volume,
+                                             double dx_inv_,
+                                             double dx_ov_dt_,
+                                             int i_domain_begin_,
+                                             int not_spectral_ )
+{
+    cudahip1d::currentAndDensityDepositionKernel1D( host_Jx, host_Jy, host_Jz, host_rho,
+                                                    Jx_size, Jy_size, Jz_size, rho_size,
+                                                    device_particle_position_x, device_particle_momentum_y,
+                                                    device_particle_momentum_z,
+                                                    device_particle_charge,
+                                                    device_particle_weight,
+                                                    host_bin_index,
+                                                    x_dimension_bin_count_,
+                                                    host_invgf_,
+                                                    host_iold_, host_deltaold_,
+                                                    inv_cell_volume,
+                                                    dx_inv_,
+                                                    dx_ov_dt_,
+                                                    i_domain_begin_,
+                                                    not_spectral_ );
+}
+#endif
+
+// ---------------------------------------------------------------------------------------------------------------------
+//! Project charge : frozen & diagFields timestep
+// ---------------------------------------------------------------------------------------------------------------------
+void Projector1D2OrderGPU::basic( double *rhoj, Particles &particles, unsigned int ipart, unsigned int type, int bin_shift )
+{
+
+    // Warning: this function is used for frozen species or initialization only and doesn't use the standard scheme.
+    // rho type = 0
+    // Jx type = 1
+    // Jy type = 2
+    // Jz type = 3
+
+    // The variable 'bin' received is the bin number times the cluster width.
+    // Declare local variables
+    int ip;
+    double xjn, xj_m_xip, xj_m_xip2;
+    double S1[5];            // array used for the Esirkepov projection method
+
+    double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart );
+    if( type > 0 ) {
+        charge_weight *= 1./sqrt( 1.0 + particles.momentum( 0, ipart )*particles.momentum( 0, ipart )
+                                      + particles.momentum( 1, ipart )*particles.momentum( 1, ipart )
+                                      + particles.momentum( 2, ipart )*particles.momentum( 2, ipart ) );
+
+        if( type == 1 ) {
+            charge_weight *= particles.momentum( 0, ipart );
+        } else if( type == 2 ) {
+            charge_weight *= particles.momentum( 1, ipart );
+        } else {
+            charge_weight *= particles.momentum( 2, ipart );
+        }
+    }
+
+    // Initialize variables
+    for( unsigned int i=0; i<5; i++ ) {
+        S1[i]=0.;
+    }//i
+
+    // Locate particle new position on the primal grid
+    xjn       = particles.position( 0, ipart ) * dx_inv_;
+    ip        = round( xjn + 0.5 * ( type==1 ) );                 // index of the central node
+    xj_m_xip  = xjn - ( double )ip;                               // normalized distance to the nearest grid point
+    xj_m_xip2 = xj_m_xip * xj_m_xip;                              // square of the normalized distance to the nearest grid point
+
+    // coefficients 2nd order interpolation on 3 nodes
+    //ip_m_ipo = ip-ipo;
+    S1[1] = 0.5 * ( xj_m_xip2 - xj_m_xip + 0.25 );
+    S1[2] = ( 0.75 - xj_m_xip2 );
+    S1[3] = 0.5 * ( xj_m_xip2 + xj_m_xip + 0.25 );
+
+    ip -= i_domain_begin_ + 2 + bin_shift;
+
+    // 2nd order projection for charge density
+    // At the 2nd order, oversize = 2. 
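+    // Worked example (illustration only): with dx = 1 and a particle at
+    // x = 10.3, xjn = 10.3 and ip = round(10.3) = 10, so xj_m_xip = 0.3 and
+    // the three non-zero weights are S1[1] = 0.5*(0.09 - 0.3 + 0.25) = 0.02,
+    // S1[2] = 0.75 - 0.09 = 0.66 and S1[3] = 0.5*(0.09 + 0.3 + 0.25) = 0.32,
+    // which sum to 1, as a charge-conserving deposition requires.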
+ for( unsigned int i=0; i<5; i++ ) { + rhoj[i + ip ] += charge_weight * S1[i]; + } + +} + + +void Projector1D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, + Particles &particles, + SmileiMPI *smpi, + int, + int, + int ithread, + bool diag_flag, + bool is_spectral, + int ispec, + int icell, + int ipart_ref ) +{ +{ + std::vector &iold = smpi->dynamics_iold[ithread]; + std::vector &delta = smpi->dynamics_deltaold[ithread]; + std::vector &invgf = smpi->dynamics_invgf[ithread]; + + EMfields->rho_->copyFromDeviceToHost(); + EMfields->rho_s[ispec]->copyFromDeviceToHost(); + if( diag_flag ) { + + double *const __restrict__ b_Jx = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->data() : EMfields->Jx_->data(); + unsigned int Jx_size = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->size() : EMfields->Jx_->size(); + + double *const __restrict__ b_Jy = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->data() : EMfields->Jy_->data(); + unsigned int Jy_size = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->size() : EMfields->Jy_->size(); + + double *const __restrict__ b_Jz = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->data() : EMfields->Jz_->data(); + unsigned int Jz_size = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->size() : EMfields->Jz_->size(); + + double *const __restrict__ b_rho = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->data() : EMfields->rho_->data(); + unsigned int rho_size = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->size() : EMfields->rho_->size(); + + // Does not compute Rho ! + +#if defined( SMILEI_ACCELERATOR_MODE ) + /*currentsAndDensity( b_Jx, b_Jy, b_Jz, b_rho, + Jx_size, Jy_size, Jz_size, rho_size, + particles, x_dimension_bin_count_, + invgf.data(), iold.data(), delta.data(), + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ );*/ + // to be deleted + std::cout<<"in projector1D2orderGPUKernel.cpp l229: rho_size= "<rho_->copyFromDeviceToHost(); + EMfields->rho_s[ispec]->copyFromDeviceToHost(); + EMfields->Jx_->copyFromDeviceToHost(); + EMfields->Jx_s[ispec]->copyFromDeviceToHost(); + EMfields->Jy_->copyFromDeviceToHost(); + EMfields->Jy_s[ispec]->copyFromDeviceToHost(); + EMfields->Jz_->copyFromDeviceToHost(); + EMfields->Jz_s[ispec]->copyFromDeviceToHost(); + std::cout<<"in projector1D2orderGPUKernel.cpp l251 after projection: rho_size= "<Jx_->data(); + Jy_ = EMfields->Jy_->data(); + Jz_ = EMfields->Jz_->data(); + rho_ = EMfields->rho_->data(); + + /*currents( Jx_, Jy_, Jz_, + EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), + particles, x_dimension_bin_count_, y_dimension_bin_count_, + invgf.data(), iold.data(), delta.data(), + inv_cell_volume, + dx_inv_, dy_inv_, + dx_ov_dt_, dy_ov_dt_, + i_domain_begin_, j_domain_begin_, + nprimy, + one_third, + not_spectral_ ); + } + double *const __restrict__ b_Jx = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->data() : EMfields->Jx_->data(); + unsigned int Jx_size = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->size() : EMfields->Jx_->size(); + + double *const __restrict__ b_Jy = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->data() : EMfields->Jy_->data(); + unsigned int Jy_size = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->size() : EMfields->Jy_->size(); + + double *const __restrict__ b_Jz = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->data() : EMfields->Jz_->data(); + unsigned int Jz_size = EMfields->Jz_s[ispec] ? 
EMfields->Jz_s[ispec]->size() : EMfields->Jz_->size();//*/ + /*Jx_ = EMfields->Jx_->data(); + Jy_ = EMfields->Jy_->data(); + Jz_ = EMfields->Jz_->data();*/ + + /*currents( Jx_, Jy_, Jz_, + EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), + particles, x_dimension_bin_count_, + invgf.data(), iold.data(), delta.data(), + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ );*/ +#if defined( SMILEI_ACCELERATOR_MODE ) + //double *device_Jx = smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( b_Jx ) ; + //printf("testing device Jx:, %p \n", device_Jx); + currentDepositionKernel1DOnDevice(Jx_, Jy_, Jz_, //b_Jx,b_Jy,b_Jz, + //Jx_size, Jy_size, Jz_size, + EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), + particles.getPtrPosition( 0 ), + particles.getPtrMomentum( 1 ), + particles.getPtrMomentum( 2 ), + particles.getPtrCharge(), + particles.getPtrWeight(), + particles.last_index.data(), + x_dimension_bin_count_, + invgf.data(), + iold.data(), + delta.data(), + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ ); +#else + SMILEI_ASSERT( false ); +#endif + } + } +} +// to be deleted +{ + double *const __restrict__ b_Jx = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->data() : EMfields->Jx_->data(); + unsigned int Jx_size = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->size() : EMfields->Jx_->size(); + + double *const __restrict__ b_Jy = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->data() : EMfields->Jy_->data(); + unsigned int Jy_size = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->size() : EMfields->Jy_->size(); + + double *const __restrict__ b_Jz = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->data() : EMfields->Jz_->data(); + unsigned int Jz_size = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->size() : EMfields->Jz_->size(); + + double *const __restrict__ b_rho = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->data() : EMfields->rho_->data(); + unsigned int rho_size = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->size() : EMfields->rho_->size(); + + std::cout<<"in projector1D2orderGPUKernel.cpp l336: rho_size= "<rho_s[ispec] ? 
" + << EMfields->rho_s[ispec] << " Jx_size " << Jx_size<< " Jy_size " << Jy_size<< " Jz_size " << Jz_size<< std::endl; + for( int ipart=0 ; ipart +#elif defined( __NVCC__ ) + #include + #include +#endif + +#include "Params.h" +#include "gpu.h" +#include + +#if defined( __HIP__ ) + // HIP compiler support enabled (for .cu files) +#else + #define PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION 1 +#endif + +#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) + #include + #include "Tools.h" +#else + #include + + #include "Params.h" + #include "gpu.h" +#endif + +// #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) + +//namespace naive { +// +// void //static inline void +// currentDepositionKernel2D( double *__restrict__ Jx, +// double *__restrict__ Jy, +// double *__restrict__ Jz, +// int Jx_size, +// int Jy_size, +// int Jz_size, +// const double *__restrict__ device_particle_position_x, +// const double *__restrict__ device_particle_momentum_y, +// const double *__restrict__ device_particle_momentum_z, +// const short *__restrict__ device_particle_charge, +// const double *__restrict__ device_particle_weight, +// const int *__restrict__ host_bin_index, +// unsigned int x_dimension_bin_count, +// const double *__restrict__ invgf_, +// const int *__restrict__ iold_, +// const double *__restrict__ deltaold_, +// double inv_cell_volume, +// double dx_inv, +// double dx_ov_dt, +// int i_domain_begin, +// int not_spectral_ ) +// { +// // The OMP implementation is NOT bin aware. As per the precondition on +// // host_bin_index, index zero always contains the number of particles. +// // See nvidiaParticles::prepareBinIndex / setHostBinIndex. +// const unsigned int bin_count = 1; +// const int particle_count = host_bin_index[bin_count - 1]; +// +// #if defined( SMILEI_ACCELERATOR_GPU_OMP ) +// #pragma omp target is_device_ptr /* map */ ( /* to: */ \ +// device_particle_position_x /* [0:particle_count] */, \ +// device_particle_momentum_y /* [0:particle_count] */, \ +// device_particle_momentum_z /* [0:particle_count] */, \ +// device_particle_charge /* [0:particle_count] */, \ +// device_particle_weight /* [0:particle_count] */ ) +// #pragma omp teams thread_limit( 64 ) distribute parallel for +// #elif defined( SMILEI_OPENACC_MODE ) +// #pragma acc parallel \ +// deviceptr( device_particle_position_x, \ +// device_particle_momentum_y, \ +// device_particle_momentum_z, \ +// device_particle_charge, \ +// device_particle_weight ) \ +// present( iold [0:3 * particle_count], \ +// deltaold [0:3 * particle_count] ) +// #pragma acc loop gang worker vector +// #endif +// for( int particle_index = 0; particle_index < particle_count; ++particle_index ) { +// const double invgf = invgf_[particle_index]; +// const int *const __restrict__ iold = &iold_[particle_index]; +// const double *const __restrict__ deltaold = &deltaold_[particle_index]; +// +// double Sx0[5]; +// double Sx1[5]; +// +// // Variable declaration & initialization +// // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf +// +// // Locate the particle on the primal grid at former time-step & calculate coeff. S0 +// { +// const double delta = deltaold[0 * particle_count]; +// const double delta2 = delta * delta; +// Sx0[0] = 0.0; +// Sx0[1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx0[2] = 0.75 - delta2; +// Sx0[3] = 0.5 * ( delta2 + delta + 0.25 ); +// Sx0[4] = 0.0; +// } +// +// // Locate the particle on the primal grid at current time-step & calculate coeff. 
S1 +// { +// const double xpn = device_particle_position_x[particle_index] * dx_inv; +// const int ip = std::round( xpn ); +// const int ipo = iold[0 * particle_count]; +// const int ip_m_ipo = ip - ipo - i_domain_begin; +// const double delta = xpn - static_cast( ip ); +// const double delta2 = delta * delta; +// +// Sx1[0] = 0.0; +// Sx1[1] = 0.0; +// // Sx1[2] = 0.0; // Always set below +// Sx1[3] = 0.0; +// Sx1[4] = 0.0; +// +// Sx1[ip_m_ipo + 1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx1[ip_m_ipo + 2] = 0.75 - delta2; +// Sx1[ip_m_ipo + 3] = 0.5 * ( delta2 + delta + 0.25 ); +// } +// +// // (x,y,z) components of the current density for the macro-particle +// const double charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * device_particle_weight[particle_index]; +// const double crx_p = charge_weight * dx_ov_dt; +// const double cry_p = charge_weight * dy_ov_dt; +// const double crz_p = charge_weight * ( 1.0 / 3.0 ) * device_particle_momentum_z[particle_index] * invgf; +// +// // This is the particle position as grid index +// // This minus 2 come from the order 2 scheme, based on a 5 points stencil from -2 to +2. +// const int ipo = iold[0 * particle_count] - 2; +// +// for( unsigned int i = 0; i < 1; ++i ) { +// const int iloc = ( i + ipo ) ; +// /* Jx[iloc] += tmpJx[0]; */ +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; j++ ) { +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + +// Sy1[j] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// } +// } +// +// double tmpJx[5]{}; +// +// for( unsigned int i = 1; i < 5; ++i ) { +// const int iloc = ( i + ipo ) ; +// tmpJx[0] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ) * ( 0.5 * ( Sy1[0] - Sy0[0] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc] += tmpJx[0]; +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; ++j ) { +// tmpJx[j] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ) * ( Sy0[j] + 0.5 * ( Sy1[j] - Sy0[j] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc + j] += tmpJx[j]; +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + +// Sy1[j] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// } +// } +// } +// } // end currentDepositionKernel +// +// //static inline +// void +// currentAndDensityDepositionKernel( double *__restrict__ Jx, +// double *__restrict__ Jy, +// double *__restrict__ Jz, +// double *__restrict__ rho, +// int Jx_size, +// int Jy_size, +// int Jz_size, +// int rho_size, +// const double *__restrict__ device_particle_position_x, +// const double *__restrict__ device_particle_momentum_y, +// const double *__restrict__ device_particle_momentum_z, +// const short *__restrict__ device_particle_charge, +// const double *__restrict__ device_particle_weight, +// const int *__restrict__ host_bin_index, +// unsigned int, +// unsigned int, +// const double *__restrict__ invgf_, +// const int *__restrict__ iold_, +// const double *__restrict__ deltaold_, +// double 
inv_cell_volume, +// double dx_inv, +// double dx_ov_dt, +// int i_domain_begin, +// int not_spectral_ ) +// { +// // The OMP implementation is NOT bin aware. As per the precondition on +// // host_bin_index, index zero always contains the number of particles. +// // See nvidiaParticles::prepareBinIndex / setHostBinIndex. +// const unsigned int bin_count = 1; +// const int particle_count = host_bin_index[bin_count - 1]; +// +// #if defined( SMILEI_ACCELERATOR_GPU_OMP ) +// #pragma omp target is_device_ptr /* map */ ( /* to: */ \ +// device_particle_position_x /* [0:particle_count] */, \ +// device_particle_momentum_y /* [0:particle_count] */, \ +// device_particle_momentum_z /* [0:particle_count] */, \ +// device_particle_charge /* [0:particle_count] */, \ +// device_particle_weight /* [0:particle_count] */ ) +// #pragma omp teams thread_limit( 64 ) distribute parallel for +// #elif defined( SMILEI_OPENACC_MODE ) +// #pragma acc parallel \ +// deviceptr( device_particle_position_x, \ +// device_particle_momentum_y, \ +// device_particle_momentum_z, \ +// device_particle_charge, \ +// device_particle_weight ) \ +// present( iold [0:3 * particle_count], \ +// deltaold [0:3 * particle_count] ) +// #pragma acc loop gang worker vector +// #endif +// for( int particle_index = 0; particle_index < particle_count; ++particle_index ) { +// const double invgf = invgf_[particle_index]; +// const int *const __restrict__ iold = &iold_[particle_index]; +// const double *const __restrict__ deltaold = &deltaold_[particle_index]; +// +// double Sx0[5]; +// double Sx1[5]; +// double Sy0[5]; +// double Sy1[5]; +// +// // Variable declaration & initialization +// // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf +// +// // Locate the particle on the primal grid at former time-step & calculate coeff. S0 +// { +// const double delta = deltaold[0 * particle_count]; +// const double delta2 = delta * delta; +// Sx0[0] = 0.0; +// Sx0[1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx0[2] = 0.75 - delta2; +// Sx0[3] = 0.5 * ( delta2 + delta + 0.25 ); +// Sx0[4] = 0.0; +// } +// // Locate the particle on the primal grid at current time-step & calculate coeff. S1 +// { +// const double xpn = device_particle_position_x[particle_index] * dx_inv; +// const int ip = std::round( xpn ); +// const int ipo = iold[0 * particle_count]; +// const int ip_m_ipo = ip - ipo - i_domain_begin; +// const double delta = xpn - static_cast( ip ); +// const double delta2 = delta * delta; +// +// Sx1[0] = 0.0; +// Sx1[1] = 0.0; +// // Sx1[2] = 0.0; // Always set below +// Sx1[3] = 0.0; +// Sx1[4] = 0.0; +// +// Sx1[ip_m_ipo + 1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx1[ip_m_ipo + 2] = 0.75 - delta2; +// Sx1[ip_m_ipo + 3] = 0.5 * ( delta2 + delta + 0.25 ); +// } +// +// // (x,y,z) components of the current density for the macro-particle +// const double charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * device_particle_weight[particle_index]; +// const double crx_p = charge_weight * dx_ov_dt; +// const double cry_p = charge_weight * dy_ov_dt; +// const double crz_p = charge_weight * ( 1.0 / 3.0 ) * device_particle_momentum_z[particle_index] * invgf; +// +// // This is the particle position as grid index +// // This minus 2 come from the order 2 scheme, based on a 5 points stencil from -2 to +2. 
+// const int ipo = iold[0 * particle_count] - 2; +// const int jpo = iold[1 * particle_count] - 2; +// +// // case i =0 +// for( unsigned int i = 0; i < 1; ++i ) { +// const int iloc = ( i + ipo ) ; +// /* Jx[iloc] += tmpJx[0]; */ +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc] += charge_weight * Sx1[0] * Sy1[0]; +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; j++ ) { +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + +// Sy1[j] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc + j] += charge_weight * Sx1[0] * Sy1[j]; +// } +// } +// +// double tmpJx[5]{}; +// +// // case i> 0 +// for( unsigned int i = 1; i < 5; ++i ) { +// const int iloc = i + ipo ; +// tmpJx[0] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc] += tmpJx[0]; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc] += charge_weight * Sx1[i] * Sy1[0]; +// +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; ++j ) { +// tmpJx[j] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ) * ( Sy0[j] + 0.5 * ( Sy1[j] - Sy0[j] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc + j] += tmpJx[j]; +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + +// Sy1[j] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc + j] += charge_weight * Sx1[i] * Sy1[j]; +// } +// } +// } +// } // end currentDepositionKernel +// +// +//} // namespace naive +// +// #else + +namespace cudahip1d { + namespace detail { +#if defined( __HIP__ ) + static inline void + checkErrors( ::hipError_t an_error_code, + const char *file_name, + int line ) + { + if( an_error_code != ::hipError_t::hipSuccess ) { + std::cout << "HIP error at " << file_name << ":" << line + << " -> " << ::hipGetErrorString( an_error_code ) << std::endl; + std::exit( EXIT_FAILURE ); + } + } +// For NVIDIA compiler +#elif defined( __NVCC__ ) + static inline void + checkErrors( ::cudaError_t an_error_code, + const char *file_name, + int line ) + { + if( an_error_code != ::cudaError_t::cudaSuccess ) { + std::cout << "CUDA error at " << file_name << ":" << line << " -> " << ::cudaGetErrorString( an_error_code ) << std::endl; + std::exit( EXIT_FAILURE ); + } + } +#endif + + } // namespace detail + + #define checkHIPErrors( an_expression ) \ + do { \ + detail::checkErrors( an_expression, __FILE__, __LINE__ ); \ + } while( 0 ) + + namespace kernel { + namespace atomic { + namespace LDS { + __device__ void + AddNoReturn( float *a_pointer, float a_value ) + { + #if defined( __gfx90a__ ) + ::unsafeAtomicAdd( a_pointer, a_value ); + #else + ::atomicAdd( a_pointer, a_value ); + #endif + } + + __device__ void + AddNoReturn( double *a_pointer, double a_value ) + { + #if defined( __gfx90a__ ) + ::unsafeAtomicAdd( a_pointer, a_value ); + #else + ::atomicAdd( a_pointer, a_value ); + #endif + } + } // namespace LDS + + namespace GDS { + __device__ void + AddNoReturn( double 
*a_pointer, double a_value ) + { + #if defined( __gfx90a__ ) + ::unsafeAtomicAdd( a_pointer, a_value ); + #else + ::atomicAdd( a_pointer, a_value ); + #endif + } + } // namespace GDS + } // namespace atomic + + + template + __device__ void inline __attribute__((always_inline)) init_S0(const ComputeFloat delta, ComputeFloat *__restrict__ S0) + { + const ComputeFloat delta2 = delta * delta; + S0[0] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + S0[1] = static_cast( 0.75 ) - delta2; + S0[2] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + S0[3] = static_cast( 0.0 ) ; + } + + template + __device__ void inline __attribute__((always_inline)) init_S1(const ComputeFloat xpn, const int ipo, const int i_domain_begin, + ComputeFloat *__restrict__ S1) + { + // const int ip = static_cast( xpn + 0.5 ); // std::round | rounding approximation which is correct enough and faster in this case + const int ip = std::round( xpn ); + const int ip_m_ipo = ip - ipo - i_domain_begin; + const ComputeFloat delta = xpn - static_cast( ip ); + const ComputeFloat delta2 = delta * delta; + + S1[0] = static_cast( 0.0 ); + S1[1] = static_cast( 0.0 ); // S1[2] = 0.0; // Always set below + S1[3] = static_cast( 0.0 ); + S1[4] = static_cast( 0.0 ); + + S1[ip_m_ipo + 1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + S1[ip_m_ipo + 2] = static_cast( 0.75 ) - delta2; + S1[ip_m_ipo + 3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + } + + + template + __global__ void + // __launch_bounds__(kWorkgroupSize, 1) + DepositCurrentDensity_1D_Order2( double *__restrict__ device_Jx, + double *__restrict__ device_Jy, + double *__restrict__ device_Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ device_bin_index, + const double *__restrict__ device_invgf_, + const int *__restrict__ device_iold_, + const double *__restrict__ device_deltaold_, + ComputeFloat inv_cell_volume, + ComputeFloat dx_inv, + ComputeFloat dx_ov_dt, + int i_domain_begin, + int not_spectral_ ) + { + /*printf(" Hallooo \n"); + for (int i=0; i( 0.0 ); + Jy_scratch_space[field_index] = static_cast( 0.0 ); + Jz_scratch_space[field_index] = static_cast( 0.0 ); + } + + __syncthreads(); + + const unsigned int particle_count = device_bin_index[bin_count - 1]; + + // This workgroup has to process distance(last_particle, + // first_particle) particles + const unsigned int first_particle = workgroup_dedicated_bin_index == 0 ? 
0 : device_bin_index[workgroup_dedicated_bin_index - 1];
+            const unsigned int last_particle  = device_bin_index[workgroup_dedicated_bin_index];
+
+            //printf("first_particle %d last_particle %d particle_count %d\n", first_particle, last_particle, particle_count);
+
+            for( unsigned int particle_index = first_particle + thread_index_offset;
+                 particle_index < last_particle;
+                 particle_index += loop_stride ) {
+                const ComputeFloat invgf                  = static_cast( device_invgf_[particle_index] );
+                const int *const __restrict__ iold        = &device_iold_[particle_index];
+                const double *const __restrict__ deltaold = &device_deltaold_[particle_index];
+
+                ComputeFloat Sx0[5];
+                ComputeFloat Sx1[5];
+
+                // Variable declaration & initialization
+                // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf
+
+                // Locate the particle on the primal grid at former time-step & calculate coeff. S0
+                {
+                    const ComputeFloat delta  = deltaold[0 * particle_count];
+                    const ComputeFloat delta2 = delta * delta;
+
+                    Sx0[0] = static_cast( 0.0 );
+                    Sx0[1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) );
+                    Sx0[2] = static_cast( 0.75 ) - delta2;
+                    Sx0[3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) );
+                    Sx0[4] = static_cast( 0.0 );
+                }
+                //init_S0(deltaold[0 * particle_count], Sx0);
+                //init_S0(deltaold[1 * particle_count], Sy0);
+
+                // Locate the particle on the primal grid at current time-step & calculate coeff. S1
+                {
+                    // const int ip = static_cast( xpn + 0.5 ); // std::round | rounding approximation which is correct enough and faster in this case
+                    const ComputeFloat xpn      = static_cast( device_particle_position_x[particle_index] ) * dx_inv;
+                    const int          ip       = std::round( xpn );
+                    const int          ipo      = iold[0 * particle_count];
+                    const int          ip_m_ipo = ip - ipo - i_domain_begin;
+                    const ComputeFloat delta    = xpn - static_cast( ip );
+                    const ComputeFloat delta2   = delta * delta;
+
+                    Sx1[0] = static_cast( 0.0 );
+                    Sx1[1] = static_cast( 0.0 );
+                    // Sx1[2] = 0.0; // Always set below
+                    Sx1[3] = static_cast( 0.0 );
+                    Sx1[4] = static_cast( 0.0 );
+
+                    Sx1[ip_m_ipo + 1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) );
+                    Sx1[ip_m_ipo + 2] = static_cast( 0.75 ) - delta2;
+                    Sx1[ip_m_ipo + 3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) );
+                }
+
+                // (x,y,z) components of the current density for the macro-particle
+                const ComputeFloat charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * static_cast( device_particle_weight[particle_index] );
+                const ComputeFloat crx_p         = charge_weight * dx_ov_dt;
+                const ComputeFloat cry_p         = charge_weight * static_cast( device_particle_momentum_y[particle_index] ) * invgf;
+                const ComputeFloat crz_p         = charge_weight * static_cast( device_particle_momentum_z[particle_index] ) * invgf;
+
+                // This is the particle position as grid index
+                // The minus 2 comes from the order-2 scheme, based on a 5-point stencil from -2 to +2. 
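+                // Charge-conservation sketch (illustration only): in 1D the
+                // Esirkepov scheme reduces to a prefix sum of shape-factor
+                // differences,
+                //
+                //   Jx(i+1/2) = Jx(i-1/2) + crx_p * ( Sx0[i] - Sx1[i] )
+                //
+                // which is what the tmpJx accumulation below implements.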
+                const int ipo = iold[0 * particle_count] -
+                                2 /* Offset so we don't use negative numbers in the loop */ -
+                                global_x_scratch_space_coordinate_offset /* Offset to get cluster relative coordinates */;
+
+                // Jx
+                ComputeFloat tmpJx[5]{};
+                for( unsigned int i = 1; i < 5; ++i ) {
+                    const int iloc = i + ipo;
+                    tmpJx[i] = tmpJx[i-1] + crx_p * (Sx0[i-1] - Sx1[i-1]);
+                    atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                }
+
+                // Jy, transverse weight = time-averaged shape factor 0.5*(Sx0+Sx1),
+                // as in the CPU Projector1D2Order (Wt)
+                for( unsigned int i = 0; i < 5; ++i ) {
+                    const int iloc = i + ipo;
+                    tmpJx[i] = cry_p * 0.5 * (Sx0[i] + Sx1[i]);
+                    atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                }
+
+                // Jz, same time-averaged transverse weight
+                for( unsigned int i = 0; i < 5; ++i ) {
+                    const int iloc = i + ipo;
+                    tmpJx[i] = crz_p * 0.5 * (Sx0[i] + Sx1[i]);
+                    atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                }
+            } // particle_index
+
+            __syncthreads();
+
+            for( unsigned int field_index = thread_index_offset; field_index < kFieldScratchSpaceSize; field_index += workgroup_size ) {
+                const unsigned int local_x_scratch_space_coordinate  = field_index % GPUClusterWithGCWidth; // /GPUClusterWithGCWidth
+                const unsigned int global_x_scratch_space_coordinate = global_x_scratch_space_coordinate_offset + local_x_scratch_space_coordinate;
+
+                const unsigned int global_memory_index = global_x_scratch_space_coordinate;
+                const unsigned int scratch_space_index = field_index; // local_x_scratch_space_coordinate * GPUClusterWithGCWidth + local_y_scratch_space_coordinate;
+
+                //printf("field_index %d, thread_index_offset %d, kFieldScratchSpaceSize %d, workgroup_size %d, GPUClusterWithGCWidth %d, global_x_scratch_space_coordinate_offset %d, global_memory_index %d, Jx_size %d\n",field_index, thread_index_offset, kFieldScratchSpaceSize, workgroup_size, GPUClusterWithGCWidth, global_x_scratch_space_coordinate_offset, global_memory_index, Jx_size);
+
+                // These atomics are basically free (very few of them).
+                atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[scratch_space_index] ) );
+                atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); // We handle the FDTD/picsar
+                atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast( Jz_scratch_space[scratch_space_index] ) );
+            }
+        } // end DepositCurrent
+
+
+        template 
+        __global__ void
+        // __launch_bounds__(kWorkgroupSize, 1)
+        DepositCurrentAndDensity_1D_Order2( double *__restrict__ device_Jx,
+                                            double *__restrict__ device_Jy,
+                                            double *__restrict__ device_Jz,
+                                            double *__restrict__ device_rho,
+                                            int Jx_size,
+                                            int Jy_size,
+                                            int Jz_size,
+                                            int rho_size,
+                                            const double *__restrict__ device_particle_position_x,
+                                            const double *__restrict__ device_particle_momentum_y,
+                                            const double *__restrict__ device_particle_momentum_z,
+                                            const short *__restrict__ device_particle_charge,
+                                            const double *__restrict__ device_particle_weight,
+                                            const int *__restrict__ device_bin_index,
+                                            const double *__restrict__ device_invgf_,
+                                            const int *__restrict__ device_iold_,
+                                            const double *__restrict__ device_deltaold_,
+                                            ComputeFloat inv_cell_volume,
+                                            ComputeFloat dx_inv,
+                                            ComputeFloat dx_ov_dt,
+                                            int i_domain_begin,
+                                            int not_spectral_ )
+        {
+            // TODO(Etienne M): refactor this function. Break it into smaller
+            // pieces (lds init/store, coeff computation, deposition etc..)
+            // TODO(Etienne M): __ldg could be used to slightly improve GDS load
+            // speed. 
This would only have an effect on Nvidia cards as this
+            // operation is a no-op on AMD.
+            const unsigned int workgroup_size = kWorkgroupSize; // blockDim.x;
+            const unsigned int bin_count      = gridDim.x;
+            const unsigned int loop_stride    = workgroup_size; // This stride should enable better memory access coalescing
+
+            const unsigned int x_cluster_coordinate          = blockIdx.x;
+            const unsigned int workgroup_dedicated_bin_index = x_cluster_coordinate ;
+            const unsigned int thread_index_offset           = threadIdx.x;
+
+            // The unit is the cell
+            const unsigned int global_x_scratch_space_coordinate_offset = x_cluster_coordinate * Params::getGPUClusterWidth( 1 /* 1D */ );
+
+            // NOTE: We gain from the particles not being sorted inside a
+            // cluster because it reduces the bank conflicts one gets when
+            // multiple threads access the same part of the shared memory. Such
+            // "conflicted" accesses are serialized!
+            // NOTE: We use a bit too much LDS. For Jx, the first row could be
+            // discarded, for Jy we could remove the first column.
+
+            const int GPUClusterWithGCWidth = Params::getGPUClusterWithGhostCellWidth( 1 /* 1D */, 2 /* 2nd order interpolation */ );
+            static constexpr unsigned int kFieldScratchSpaceSize = Params::getGPUInterpolationClusterCellVolume( 1 /* 1D */, 2 /* 2nd order interpolation */ );
+
+            // NOTE: I tried having only one cache and reusing it. Doing that
+            // requires you to iterate multiple times over the particles, which is
+            // possible but costs more bandwidth. The speedup was ~x0.92.
+            __shared__ ReductionFloat Jx_scratch_space[kFieldScratchSpaceSize];
+            __shared__ ReductionFloat Jy_scratch_space[kFieldScratchSpaceSize];
+            __shared__ ReductionFloat Jz_scratch_space[kFieldScratchSpaceSize];
+            __shared__ ReductionFloat rho_scratch_space[kFieldScratchSpaceSize];
+
+            // Init the shared memory
+
+            for( unsigned int field_index = thread_index_offset;
+                 field_index < kFieldScratchSpaceSize;
+                 field_index += workgroup_size ) {
+                Jx_scratch_space[field_index]  = static_cast( 0.0 );
+                Jy_scratch_space[field_index]  = static_cast( 0.0 );
+                Jz_scratch_space[field_index]  = static_cast( 0.0 );
+                rho_scratch_space[field_index] = static_cast( 0.0 );
+            }
+
+            __syncthreads();
+
+            const unsigned int particle_count = device_bin_index[bin_count - 1];
+
+            // This workgroup has to process distance(last_particle,
+            // first_particle) particles
+            const unsigned int first_particle = workgroup_dedicated_bin_index == 0 ? 
0 : device_bin_index[workgroup_dedicated_bin_index - 1];
+            const unsigned int last_particle  = device_bin_index[workgroup_dedicated_bin_index];
+
+            //printf(" first_particle %d last_particle %d loopstride %d \n",first_particle, last_particle, loop_stride);
+
+            for( unsigned int particle_index = first_particle + thread_index_offset;
+                 particle_index < last_particle;
+                 particle_index += loop_stride ) {
+                const ComputeFloat invgf                  = static_cast( device_invgf_[particle_index] );
+                const int *const __restrict__ iold        = &device_iold_[particle_index];
+                const double *const __restrict__ deltaold = &device_deltaold_[particle_index];
+
+                //printf("in projector cuda l735: particle charge= %f weight %f position_x= %f, momentum y = %f, momentum z = %f, charge*sqrt(2) %+4.15e \n", static_cast( device_particle_charge[particle_index]) , static_cast( device_particle_weight[particle_index]),
+                //        static_cast( device_particle_position_x[particle_index] ), static_cast( device_particle_momentum_y[particle_index] ),
+                //        static_cast( device_particle_momentum_z[particle_index] ), static_cast( device_particle_charge[particle_index]) * static_cast(sqrt(2.0)));
+
+                ComputeFloat Sx0[5];
+                ComputeFloat Sx1[5];
+
+                // Variable declaration & initialization
+                // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf
+
+                // Locate the particle on the primal grid at former time-step & calculate coeff. S0
+                {
+                    const ComputeFloat delta  = deltaold[0 * particle_count];
+                    const ComputeFloat delta2 = delta * delta;
+
+                    Sx0[0] = static_cast( 0.0 );
+                    Sx0[1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) );
+                    Sx0[2] = static_cast( 0.75 ) - delta2;
+                    Sx0[3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) );
+                    Sx0[4] = static_cast( 0.0 );
+                }
+
+                // Locate the particle on the primal grid at current time-step & calculate coeff. S1
+                {
+                    // const int ip = static_cast( xpn + 0.5 ); // std::round | rounding approximation which is correct enough and faster in this case
+                    const ComputeFloat xpn      = static_cast( device_particle_position_x[particle_index] ) * dx_inv;
+                    const int          ip       = std::round( xpn );
+                    const int          ipo      = iold[0 * particle_count];
+                    const int          ip_m_ipo = ip - ipo - i_domain_begin;
+                    const ComputeFloat delta    = xpn - static_cast( ip );
+                    const ComputeFloat delta2   = delta * delta;
+
+                    Sx1[0] = static_cast( 0.0 );
+                    Sx1[1] = static_cast( 0.0 );
+                    // Sx1[2] = 0.0; // Always set below
+                    Sx1[3] = static_cast( 0.0 );
+                    Sx1[4] = static_cast( 0.0 );
+
+                    Sx1[ip_m_ipo + 1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) );
+                    Sx1[ip_m_ipo + 2] = static_cast( 0.75 ) - delta2;
+                    Sx1[ip_m_ipo + 3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) );
+                }
+
+                // (x,y,z) components of the current density for the macro-particle
+                const ComputeFloat charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * static_cast( device_particle_weight[particle_index] );
+                const ComputeFloat crx_p         = charge_weight * dx_ov_dt;
+                const ComputeFloat cry_p         = charge_weight * static_cast( device_particle_momentum_y[particle_index] ) * invgf;
+                const ComputeFloat crz_p         = charge_weight * static_cast( device_particle_momentum_z[particle_index] ) * invgf;
+
+                // This is the particle position as grid index
+                // The minus 2 comes from the order-2 scheme, based on a 5-point stencil from -2 to +2. 
+                const int ipo = iold[0 * particle_count] -
+                                2 /* Offset so we don't use negative numbers in the loop */ -
+                                global_x_scratch_space_coordinate_offset /* Offset to get cluster relative coordinates */;
+
+                // Jx
+                ComputeFloat tmpJx[5]{};
+                for( unsigned int i = 1; i < 5; ++i ) {
+                    const int iloc = i + ipo;
+                    tmpJx[i] = tmpJx[i-1] + crx_p * (Sx0[i-1] - Sx1[i-1]);
+                    atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                }
+
+                // Jy, transverse weight = time-averaged shape factor 0.5*(Sx0+Sx1),
+                // as in the CPU Projector1D2Order (Wt)
+                for( unsigned int i = 0; i < 5; ++i ) {
+                    const int iloc = i + ipo;
+                    tmpJx[i] = cry_p * 0.5 * (Sx0[i] + Sx1[i]);
+                    atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                }
+
+                // Jz, same time-averaged transverse weight
+                for( unsigned int i = 0; i < 5; ++i ) {
+                    const int iloc = i + ipo;
+                    tmpJx[i] = crz_p * 0.5 * (Sx0[i] + Sx1[i]);
+                    atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                }
+
+                // Rho
+                for( unsigned int i = 0; i < 5; ++i ) {
+                    const int iloc = i + ipo;
+                    atomic::LDS::AddNoReturn( &rho_scratch_space[iloc], static_cast( charge_weight * Sx1[i] ) );
+                }
+
+                // improvement ideas: 1. unrolling to reduce the size of Sx0 and Sx1
+                //                    2. combine the loops
+
+                /*
+                //
+                {
+                    //ComputeFloat tmp = 0.5 * (Sx0[0] - Sx1[0]); // = - 0.5 * Sx1[0]
+                    atomic::LDS::AddNoReturn( &Jy_scratch_space[ipo], static_cast( -cry_p * 0.5 * Sx1[0] ) );
+                    atomic::LDS::AddNoReturn( &Jz_scratch_space[ipo], static_cast( -crz_p * 0.5 * Sx1[0] ) );
+                    atomic::LDS::AddNoReturn( &rho_scratch_space[ipo], static_cast( charge_weight * Sx1[0] ) );
+                }*/
+                /*for( unsigned int i = 1; i < 4; ++i ) {
+                    const int iloc = i + ipo;
+                    tmpJx[i] = tmpJx[i-1] + crx_p * (Sx0[i-1] - Sx1[i-1]);
+                    ComputeFloat tmp = 0.5 * (Sx0[i] - Sx1[i]);
+                    atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                    atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( cry_p * tmp ) );
+                    atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( crz_p * tmp ) );
+                    atomic::LDS::AddNoReturn( &rho_scratch_space[iloc], static_cast( charge_weight * Sx1[i] ) );
+                }*/
+                /* i=4
+                {
+                    const int iloc = i + ipo;
+                    tmpJx[4] = tmpJx[3] + crx_p * (Sx0[i-1] - Sx1[i-1]); // can save some registers by tmpJx[0] instead of tmpJx[4] ? reducing its size from 5 to 4?
+                    //ComputeFloat tmp = 0.5 * (Sx0[4] - Sx1[4]); // = -0.5 * Sx1[4]
+                    atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) );
+                    atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( -cry_p * 0.5 * Sx1[4] ) ); //null
+                    atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( -crz_p * 0.5 * Sx1[4] ) ); //null
+                    atomic::LDS::AddNoReturn( &rho_scratch_space[iloc], static_cast( charge_weight * Sx1[4] ) ); //null
+                }
+
+
+                */
+
+            } // particle_index
+
+            __syncthreads();
+
+            for( unsigned int field_index = thread_index_offset;
+                 field_index < kFieldScratchSpaceSize;
+                 field_index += workgroup_size ) {
+
+                const unsigned int local_x_scratch_space_coordinate  = field_index % GPUClusterWithGCWidth;
+                const unsigned int global_x_scratch_space_coordinate = global_x_scratch_space_coordinate_offset + local_x_scratch_space_coordinate;
+
+                const unsigned int global_memory_index = global_x_scratch_space_coordinate;
+                const unsigned int scratch_space_index = field_index; // local_x_scratch_space_coordinate * GPUClusterWithGCWidth + local_y_scratch_space_coordinate;
+
+                // These atomics are basically free (very few of them). 
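+                // Two-level reduction (illustration only): each block first
+                // accumulates into its private LDS tile, then flushes the tile
+                // once, i.e. one global atomic per scratch cell instead of one
+                // per particle contribution, roughly:
+                //
+                //   atomicAdd( &device_field[tile_offset + c], scratch[c] );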
+            atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast<double>( Jx_scratch_space[scratch_space_index] ) );
+            atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FDTD/picsar layouts */ not_spectral_ * global_x_scratch_space_coordinate], static_cast<double>( Jy_scratch_space[scratch_space_index] ) );
+            atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast<double>( Jz_scratch_space[scratch_space_index] ) );
+            atomic::GDS::AddNoReturn( &device_rho[global_memory_index], static_cast<double>( rho_scratch_space[scratch_space_index] ) );
+        }
+    }
+    } // namespace kernel
+
+
+    //static inline
+    void
+    currentDepositionKernel1D( double *__restrict__ host_Jx,
+                               double *__restrict__ host_Jy,
+                               double *__restrict__ host_Jz,
+                               int Jx_size,
+                               int Jy_size,
+                               int Jz_size,
+                               const double *__restrict__ device_particle_position_x,
+                               const double *__restrict__ device_particle_momentum_y,
+                               const double *__restrict__ device_particle_momentum_z,
+                               const short *__restrict__ device_particle_charge,
+                               const double *__restrict__ device_particle_weight,
+                               const int *__restrict__ host_bin_index,
+                               unsigned int x_dimension_bin_count,
+                               const double *__restrict__ host_invgf_,
+                               const int *__restrict__ host_iold_,
+                               const double *__restrict__ host_deltaold_,
+                               double inv_cell_volume,
+                               double dx_inv,
+                               double dx_ov_dt,
+                               int i_domain_begin,
+                               int not_spectral_ )
+    {
+        SMILEI_ASSERT( Params::getGPUClusterWidth( 1 /* 1D */ ) != -1 &&
+                       Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 );
+
+        // NOTE:
+        // This kernel is very strongly bound by atomic operations in LDS (shared memory).
+        // TODO(Etienne M): Find a way to lessen the atomic usage
+
+        const ::dim3 kGridDimension{ static_cast<uint32_t>( x_dimension_bin_count ), 1, 1 };
+
+        static constexpr std::size_t kWorkgroupSize = 128;
+        const ::dim3 kBlockDimension{ static_cast<uint32_t>( kWorkgroupSize ), 1, 1 };
+
+        // NOTE: On cards lacking hardware-backed Binary64 atomic operations,
+        // falling back to Binary32 (supposing hardware support for atomic
+        // operations) can lead to a drastic performance improvement.
+        // One just needs to assign 'float' to ReductionFloat.
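+        // A minimal sketch of the Binary32 fallback mentioned above, assuming the
+        // target device supports hardware float atomics; the kernel template is
+        // simply instantiated with a narrower reduction type:
+        //
+        //     using ComputeFloat   = double; // keep per-particle arithmetic in double
+        //     using ReductionFloat = float;  // accumulate J and rho in float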
+
+        using ComputeFloat   = double;
+        using ReductionFloat = double;
+
+        auto KernelFunction = kernel::DepositCurrentDensity_1D_Order2<ComputeFloat, ReductionFloat, kWorkgroupSize>;
+#if defined ( __HIP__ )
+        hipLaunchKernelGGL( KernelFunction,
+                            kGridDimension,
+                            kBlockDimension,
+                            0, // Shared memory
+                            0, // Stream
+                            // Kernel arguments
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ),
+                            Jx_size, Jy_size, Jz_size,
+                            device_particle_position_x,
+                            device_particle_momentum_y,
+                            device_particle_momentum_z,
+                            device_particle_charge,
+                            device_particle_weight,
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ),
+                            inv_cell_volume,
+                            dx_inv,
+                            dx_ov_dt,
+                            i_domain_begin,
+                            not_spectral_ );
+
+        checkHIPErrors( ::hipDeviceSynchronize() );
+#elif defined ( __NVCC__ )
+        KernelFunction<<<kGridDimension, kBlockDimension>>>
+        (
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ),
+            Jx_size, Jy_size, Jz_size,
+            device_particle_position_x,
+            device_particle_momentum_y,
+            device_particle_momentum_z,
+            device_particle_charge,
+            device_particle_weight,
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ),
+            inv_cell_volume,
+            dx_inv,
+            dx_ov_dt,
+            i_domain_begin,
+            not_spectral_
+        );
+        checkHIPErrors( ::cudaDeviceSynchronize() );
+#endif
+    }
+
+    //static inline
+    void
+    currentAndDensityDepositionKernel1D( double *__restrict__ host_Jx,
+                                         double *__restrict__ host_Jy,
+                                         double *__restrict__ host_Jz,
+                                         double *__restrict__ host_rho,
+                                         int Jx_size,
+                                         int Jy_size,
+                                         int Jz_size,
+                                         int rho_size,
+                                         const double *__restrict__ device_particle_position_x,
+                                         const double *__restrict__ device_particle_momentum_y,
+                                         const double *__restrict__ device_particle_momentum_z,
+                                         const short *__restrict__ device_particle_charge,
+                                         const double *__restrict__ device_particle_weight,
+                                         const int *__restrict__ host_bin_index,
+                                         unsigned int x_dimension_bin_count,
+                                         const double *__restrict__ host_invgf_,
+                                         const int *__restrict__ host_iold_,
+                                         const double *__restrict__ host_deltaold_,
+                                         double inv_cell_volume,
+                                         double dx_inv,
+                                         double dx_ov_dt,
+                                         int i_domain_begin,
+                                         int not_spectral_ )
+    {
+        // 1 because 1D; 2 because of 2nd-order interpolation
+        SMILEI_ASSERT( Params::getGPUClusterWidth( 1 ) != -1 &&
+                       Params::getGPUClusterGhostCellBorderWidth( 2 ) != -1 );
+
+        const ::dim3 kGridDimension{ static_cast<uint32_t>( x_dimension_bin_count ), 1, 1 };
+
+        static constexpr std::size_t kWorkgroupSize = 128;
+        const ::dim3 kBlockDimension{ static_cast<uint32_t>( kWorkgroupSize ), 1, 1 };
+
+        //printf("ClusterWidth %d clusterGhostCellBorderWidth %d x_dimension_bin_count %d \n", Params::getGPUClusterWidth( 1 ), Params::getGPUClusterGhostCellBorderWidth( 2 ), x_dimension_bin_count);
+
+        // NOTE: On cards lacking hardware-backed Binary64 atomic operations,
+        // falling back to Binary32 (supposing hardware support for atomic
+        // operations) can lead to a drastic performance improvement.
+        // One just needs to assign 'float' to ReductionFloat.
+        //
+        using ComputeFloat   = double;
+        using ReductionFloat = double;
+        auto KernelFunction = kernel::DepositCurrentAndDensity_1D_Order2<ComputeFloat, ReductionFloat, kWorkgroupSize>;
+#if defined ( __HIP__ )
+        hipLaunchKernelGGL( KernelFunction,
+                            kGridDimension,
+                            kBlockDimension,
+                            0, // Shared memory
+                            0, // Stream
+                            // Kernel arguments
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_rho ),
+                            Jx_size, Jy_size, Jz_size, rho_size,
+                            device_particle_position_x,
+                            device_particle_momentum_y,
+                            device_particle_momentum_z,
+                            device_particle_charge,
+                            device_particle_weight,
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ),
+                            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ),
+                            inv_cell_volume,
+                            dx_inv,
+                            dx_ov_dt,
+                            i_domain_begin,
+                            not_spectral_ );
+
+        checkHIPErrors( ::hipDeviceSynchronize() );
+#elif defined ( __NVCC__ )
+        KernelFunction<<<kGridDimension, kBlockDimension>>>
+        (
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_rho ),
+            Jx_size, Jy_size, Jz_size, rho_size,
+            device_particle_position_x,
+            device_particle_momentum_y,
+            device_particle_momentum_z,
+            device_particle_charge,
+            device_particle_weight,
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ),
+            smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ),
+            inv_cell_volume,
+            dx_inv,
+            dx_ov_dt,
+            i_domain_begin,
+            not_spectral_
+        );
+        checkHIPErrors( ::cudaDeviceSynchronize() );
+#endif
+    }
+
+} // namespace cudahip1d
+
+
diff --git a/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h
new file mode 100755
index 000000000..37cabb963
--- /dev/null
+++ b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h
@@ -0,0 +1,71 @@
+//!
HIP CUDA implementation + +#ifndef Projector1D2OrderGPUKernelCUDAHIP_H +#define Projector1D2OrderGPUKernelCUDAHIP_H + +#if defined( SMILEI_ACCELERATOR_MODE ) + +#if defined( __HIP__ ) + #include +#elif defined( __NVCC__ ) + #include + #include +#endif + +#include "Params.h" +#include "gpu.h" + +namespace cudahip1d { + +void currentDepositionKernel1D( double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ); + +void currentAndDensityDepositionKernel1D( + double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + double *__restrict__ host_rho, + int Jx_size, + int Jy_size, + int Jz_size, + int rho_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ); + +} // namespace cudahip1d + +#endif +#endif + diff --git a/src/Projector/Projector1D4Order.cpp b/src/Projector/Projector1D4Order.cpp index e78ddea67..ea4eafa4a 100755 --- a/src/Projector/Projector1D4Order.cpp +++ b/src/Projector/Projector1D4Order.cpp @@ -19,11 +19,11 @@ Projector1D4Order::Projector1D4Order( Params ¶ms, Patch *patch ) : Projector1D( params, patch ) { dx_inv_ = 1.0/params.cell_length[0]; - dx_ov_dt = params.cell_length[0] / params.timestep; + dx_ov_dt_ = params.cell_length[0] / params.timestep; //double defined for use in coefficients - index_domain_begin = patch->getCellStartingGlobalIndex( 0 ); + i_domain_begin_ = patch->getCellStartingGlobalIndex( 0 ); DEBUG( "cell_length "<< params.cell_length[0] ); @@ -43,7 +43,7 @@ void Projector1D4Order::currents( double *Jx, double *Jy, double *Jz, Particles int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xipo3, xj_m_xipo4, xj_m_xip, xj_m_xip2, xj_m_xip3, xj_m_xip4; - double crx_p = charge_weight*dx_ov_dt; // current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[7], S1[7], Wl[7], Wt[7], Jx_p[7]; // arrays used for the Esirkepov projection method @@ -82,7 +82,7 @@ void 
Projector1D4Order::currents( double *Jx, double *Jy, double *Jz, Particles // coefficients 2nd order interpolation on 5 nodes ipo = *iold; // index of the central node - ip_m_ipo = ip-ipo-index_domain_begin; + ip_m_ipo = ip-ipo-i_domain_begin_; S1[ip_m_ipo+1] = dble_1_ov_384 - dble_1_ov_48 * xj_m_xip + dble_1_ov_16 * xj_m_xip2 - dble_1_ov_12 * xj_m_xip3 + dble_1_ov_24 * xj_m_xip4; S1[ip_m_ipo+2] = dble_19_ov_96 - dble_11_ov_24 * xj_m_xip + dble_1_ov_4 * xj_m_xip2 + dble_1_ov_6 * xj_m_xip3 - dble_1_ov_6 * xj_m_xip4; @@ -125,7 +125,7 @@ void Projector1D4Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xipo3, xj_m_xipo4, xj_m_xip, xj_m_xip2, xj_m_xip3, xj_m_xip4; - double crx_p = charge_weight*dx_ov_dt; // current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[7], S1[7], Wl[7], Wt[7], Jx_p[7]; // arrays used for the Esirkepov projection method @@ -164,7 +164,7 @@ void Projector1D4Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, // coefficients 2nd order interpolation on 5 nodes ipo = *iold; // index of the central node - ip_m_ipo = ip-ipo-index_domain_begin; + ip_m_ipo = ip-ipo-i_domain_begin_; S1[ip_m_ipo+1] = dble_1_ov_384 - dble_1_ov_48 * xj_m_xip + dble_1_ov_16 * xj_m_xip2 - dble_1_ov_12 * xj_m_xip3 + dble_1_ov_24 * xj_m_xip4; S1[ip_m_ipo+2] = dble_19_ov_96 - dble_11_ov_24 * xj_m_xip + dble_1_ov_4 * xj_m_xip2 + dble_1_ov_6 * xj_m_xip3 - dble_1_ov_6 * xj_m_xip4; @@ -253,7 +253,7 @@ void Projector1D4Order::basic( double *rhoj, Particles &particles, unsigned int S1[4] = dble_19_ov_96 + dble_11_ov_24 * xj_m_xip + dble_1_ov_4 * xj_m_xip2 - dble_1_ov_6 * xj_m_xip3 - dble_1_ov_6 * xj_m_xip4; S1[5] = dble_1_ov_384 + dble_1_ov_48 * xj_m_xip + dble_1_ov_16 * xj_m_xip2 + dble_1_ov_12 * xj_m_xip3 + dble_1_ov_24 * xj_m_xip4; - ip -= index_domain_begin + 3 + bin_shift ; + ip -= i_domain_begin_ + 3 + bin_shift ; // 4th order projection for the charge density // At the 4th order, oversize = 3. 
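As a reference for the hunks above and below, here is a minimal standalone sketch (not part of the patch) of how the order-4 shape factors behind the dble_* constants are evaluated. The central coefficient does not appear in these hunks, so the standard quartic B-spline value 115/192 - 5/8*d^2 + 1/4*d^4 is assumed here; the assert checks the normalization that makes the Esirkepov deposition charge-conserving.

#include <cassert>
#include <cmath>

// Order-4 B-spline shape factors on the 7-point stencil used by Projector1D4Order.
// 'd' is the normalized distance to the central node, |d| <= 0.5.
static void shape4( double d, double S[7] )
{
    const double d2 = d * d, d3 = d2 * d, d4 = d2 * d2;
    S[0] = 0.0;
    S[1] = 1.0/384.0  - 1.0/48.0  * d + 1.0/16.0 * d2 - 1.0/12.0 * d3 + 1.0/24.0 * d4;
    S[2] = 19.0/96.0  - 11.0/24.0 * d + 1.0/4.0  * d2 + 1.0/6.0  * d3 - 1.0/6.0  * d4;
    S[3] = 115.0/192.0 - 5.0/8.0 * d2 + 1.0/4.0 * d4; // assumed central coefficient
    S[4] = 19.0/96.0  + 11.0/24.0 * d + 1.0/4.0  * d2 - 1.0/6.0  * d3 - 1.0/6.0  * d4;
    S[5] = 1.0/384.0  + 1.0/48.0  * d + 1.0/16.0 * d2 + 1.0/12.0 * d3 + 1.0/24.0 * d4;
    S[6] = 0.0;
}

int main()
{
    double S[7], sum = 0.0;
    shape4( 0.3, S );
    for( int i = 0; i < 7; i++ ) { sum += S[i]; }
    assert( std::fabs( sum - 1.0 ) < 1e-12 ); // normalization <=> charge conservation
    return 0;
}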
@@ -299,7 +299,7 @@ void Projector1D4Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi3 = xjmxi2*xjmxi; // cube xjmxi4 = xjmxi2*xjmxi2; // fourth-power - i -= index_domain_begin; + i -= i_domain_begin_; im2 = i-2; im1 = i-1; ip1 = i+1; @@ -326,7 +326,7 @@ void Projector1D4Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi = xjn - ( double )i; // normalized distance to the nearest grid point xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the nearest grid point - i -= index_domain_begin; + i -= i_domain_begin_; im2 = i-2; im1 = i-1; ip1 = i+1; @@ -476,7 +476,7 @@ void Projector1D4Order::ionizationCurrentsForTasks( double *b_Jx, double *b_Jy, Sxd[3] = dble_19_ov_96 + dble_11_ov_24 * xpmxid + dble_1_ov_4 * xpmxid2 - dble_1_ov_6 * xpmxid3 - dble_1_ov_6 * xpmxid4; Sxd[4] = dble_1_ov_384 + dble_1_ov_48 * xpmxid + dble_1_ov_16 * xpmxid2 + dble_1_ov_12 * xpmxid3 + dble_1_ov_24 * xpmxid4; - ip -= index_domain_begin+bin_shift; + ip -= i_domain_begin_+bin_shift; // id -= i_domain_begin; for (unsigned int i=0 ; i<5 ; i++) { diff --git a/src/Projector/Projector1D4Order.h b/src/Projector/Projector1D4Order.h index 6cd570d62..3ef38a7c7 100755 --- a/src/Projector/Projector1D4Order.h +++ b/src/Projector/Projector1D4Order.h @@ -33,7 +33,6 @@ class Projector1D4Order : public Projector1D void susceptibility( ElectroMagn *EMfields, Particles &particles, double species_mass, SmileiMPI *smpi, int istart, int iend, int ithread, int icell = 0, int ipart_ref = 0 ) override final; private: - double dx_ov_dt; static constexpr double dble_1_ov_384 = 1.0/384.0; static constexpr double dble_1_ov_48 = 1.0/48.0 ; static constexpr double dble_1_ov_16 = 1.0/16.0 ; diff --git a/src/Projector/Projector2D2OrderGPU.cpp b/src/Projector/Projector2D2OrderGPU.cpp index cfe20eb7d..82e1fd0e2 100755 --- a/src/Projector/Projector2D2OrderGPU.cpp +++ b/src/Projector/Projector2D2OrderGPU.cpp @@ -21,7 +21,7 @@ Projector2D2OrderGPU::Projector2D2OrderGPU( Params ¶meters, Patch *a_patch ) // initialize it's member variable) we better initialize // Projector2D2OrderGPU's member variable after explicitly initializing // Projector2D. 
- not_spectral = !parameters.is_pxr; + not_spectral_ = !parameters.is_pxr; dt = parameters.timestep; dts2 = dt / 2.0; dts4 = dts2 / 2.0; @@ -69,10 +69,10 @@ currentDepositionKernel2DOnDevice( double *__restrict__ Jx, int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ); + int not_spectral_ ); extern "C" void -currentAndDensityDepositionKernelOnDevice( double *__restrict__ Jx, +currentAndDensityDepositionKernel2DOnDevice( double *__restrict__ Jx, double *__restrict__ Jy, double *__restrict__ Jz, double *__restrict__ rho, @@ -99,7 +99,7 @@ currentAndDensityDepositionKernelOnDevice( double *__restrict__ Jx, int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ); + int not_spectral_ ); #endif @@ -130,7 +130,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy int j_domain_begin, int nprimy, double, - int not_spectral ) + int not_spectral_ ) { #if defined( SMILEI_ACCELERATOR_MODE )//SMILEI_ACCELERATOR_GPU_OMP ) currentDepositionKernel2DOnDevice( Jx, @@ -158,7 +158,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); #else SMILEI_ASSERT( false ); #endif @@ -191,10 +191,10 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy int j_domain_begin, int nprimy, double, - int not_spectral ) + int not_spectral_ ) { #if defined( SMILEI_ACCELERATOR_MODE )//SMILEI_ACCELERATOR_GPU_OMP ) - currentAndDensityDepositionKernelOnDevice( Jx, + currentAndDensityDepositionKernel2DOnDevice( Jx, Jy, Jz, rho, @@ -221,7 +221,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); #else SMILEI_ASSERT( false ); #endif @@ -368,7 +368,7 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, // i_domain_begin_, j_domain_begin_, // nprimy, // one_third, - // not_spectral ); + // not_spectral_ ); // } // Does not compute Rho ! 
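These hunks only rename the flag, but its meaning is worth keeping in mind: not_spectral_ = !parameters.is_pxr is 1 for the native FDTD solver and 0 when the spectral picsar/PXR backend owns the fields. With FDTD, Jy is dual in y, so each x-row of the 2D array is one cell longer; the flag folds both layouts into a single indexing formula. A minimal sketch, with a hypothetical helper name:

// Row-major Jy addressing for both backends: FDTD rows hold nprimy + 1 values
// (dual in y), spectral rows hold nprimy; not_spectral is 1 for FDTD, 0 for PXR.
inline int jy_index( int ix, int iy, int nprimy, int not_spectral )
{
    // ix * ( nprimy + not_spectral ) + iy == ( ix * nprimy + iy ) + not_spectral * ix
    return ix * ( nprimy + not_spectral ) + iy;
}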
@@ -385,7 +385,7 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, nprimy, one_third, - not_spectral ); + not_spectral_ ); } else { // If no field diagnostics this timestep, then the projection is done directly on the total arrays @@ -401,7 +401,7 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, // i_domain_begin_, j_domain_begin_, // nprimy, // one_third, - // not_spectral ); + // not_spectral_ ); // } } else { @@ -420,7 +420,7 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, nprimy, one_third, - not_spectral ); + not_spectral_ ); } } } @@ -467,7 +467,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -490,7 +490,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} // // @@ -524,7 +524,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -547,7 +547,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} //#endif diff --git a/src/Projector/Projector2D2OrderGPU.h b/src/Projector/Projector2D2OrderGPU.h index 9a799f9b5..15b6d2afa 100755 --- a/src/Projector/Projector2D2OrderGPU.h +++ b/src/Projector/Projector2D2OrderGPU.h @@ -78,7 +78,7 @@ class Projector2D2OrderGPU : public Projector2D double dt; double dts2; double dts4; - int not_spectral; + int not_spectral_; unsigned int x_dimension_bin_count_; unsigned int y_dimension_bin_count_; }; diff --git a/src/Projector/Projector2D2OrderGPUKernel.cpp b/src/Projector/Projector2D2OrderGPUKernel.cpp old mode 100644 new mode 100755 index 8f38f52fe..2f36ae2b0 --- a/src/Projector/Projector2D2OrderGPUKernel.cpp +++ b/src/Projector/Projector2D2OrderGPUKernel.cpp @@ -33,7 +33,7 @@ currentDepositionKernel2DOnDevice( double *__restrict__ host_Jx, int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { //#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) //naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -56,14 +56,14 @@ currentDepositionKernel2DOnDevice( double *__restrict__ host_Jx, dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); } //! Project global current and charge densities (EMfields->Jx_/Jy_/Jz_/rho_) //! 
extern "C" void -currentAndDensityDepositionKernelOnDevice( double *__restrict__ host_Jx, +currentAndDensityDepositionKernel2DOnDevice( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, double *__restrict__ host_rho, @@ -90,14 +90,14 @@ currentAndDensityDepositionKernelOnDevice( double *__restrict__ host_Jx, int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { //#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) //naive:: // the naive, OMP version serves as a reference along with the CPU version //#else cudahip2d:: //#endif - currentAndDensityDepositionKernel( host_Jx, host_Jy, host_Jz, host_rho, + currentAndDensityDepositionKernel2D( host_Jx, host_Jy, host_Jz, host_rho, Jx_size, Jy_size, Jz_size, rho_size, device_particle_position_x, device_particle_position_y, device_particle_momentum_z, @@ -113,7 +113,7 @@ currentAndDensityDepositionKernelOnDevice( double *__restrict__ host_Jx, dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); } #endif diff --git a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu old mode 100644 new mode 100755 index 666a409f4..ad966328a --- a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu +++ b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu @@ -20,20 +20,20 @@ #if defined( __HIP__ ) // HIP compiler support enabled (for .cu files) - #else - #define PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION 1 - #endif +#else + #define PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION 1 +#endif - #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) - #include +#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) + #include - #include "Tools.h" - #else - #include + #include "Tools.h" +#else + #include - #include "Params.h" - #include "gpu.h" - #endif + #include "Params.h" + #include "gpu.h" +#endif // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) @@ -65,7 +65,7 @@ // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) // { // // The OMP implementation is NOT bin aware. As per the precondition on // // host_bin_index, index zero always contains the number of particles. @@ -185,7 +185,7 @@ // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( /* i + */ ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + @@ -209,7 +209,7 @@ // Jx[iloc + j] += tmpJx[j]; // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( i + ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + @@ -248,7 +248,7 @@ // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) // { // // The OMP implementation is NOT bin aware. As per the precondition on // // host_bin_index, index zero always contains the number of particles. 
@@ -372,7 +372,7 @@ // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( /* i + */ ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + @@ -407,7 +407,7 @@ // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( i + ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + @@ -567,7 +567,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { // TODO(Etienne M): refactor this function. Break it into smaller // pieces (lds init/store, coeff computation, deposition etc..) @@ -867,7 +867,7 @@ namespace cudahip2d { // These atomics are basically free (very few of them). atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[scratch_space_index] ) ); - atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast( Jz_scratch_space[scratch_space_index] ) ); } } // end DepositCurrent @@ -903,7 +903,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { // TODO(Etienne M): refactor this function. Break it into smaller // pieces (lds init/store, coeff computation, deposition etc..) @@ -1146,7 +1146,7 @@ namespace cudahip2d { // These atomics are basically free (very few of them). 
atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[scratch_space_index] ) ); - atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast( Jz_scratch_space[scratch_space_index] ) ); atomic::GDS::AddNoReturn( &device_rho[global_memory_index], static_cast( rho_scratch_space[scratch_space_index] ) ); } @@ -1181,7 +1181,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 2 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -1229,7 +1229,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) @@ -1258,7 +1258,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif @@ -1266,7 +1266,7 @@ namespace cudahip2d { //static inline void - currentAndDensityDepositionKernel( double *__restrict__ host_Jx, + currentAndDensityDepositionKernel2D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, double *__restrict__ host_rho, @@ -1293,7 +1293,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 2 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -1341,7 +1341,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) @@ -1371,7 +1371,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif @@ -1409,7 +1409,7 @@ namespace cudahip2d { // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -1432,7 +1432,7 @@ namespace cudahip2d { // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} // ////! 
Project global current and charge densities (EMfields->Jx_/Jy_/Jz_/rho_) @@ -1465,7 +1465,7 @@ namespace cudahip2d { // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -1488,6 +1488,6 @@ namespace cudahip2d { // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} diff --git a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h old mode 100644 new mode 100755 index d607a4ab4..7aae8d2c6 --- a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h +++ b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h @@ -20,8 +20,7 @@ namespace cudahip2d { //static -void - currentDepositionKernel2D( double *__restrict__ host_Jx, +void currentDepositionKernel2D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, int Jx_size, @@ -46,11 +45,10 @@ void int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ); + int not_spectral_ ); //static -inline void - currentAndDensityDepositionKernel( +void currentAndDensityDepositionKernel2D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, @@ -78,7 +76,7 @@ inline void int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ); + int not_spectral_ ); } // namespace cudahip2d diff --git a/src/Projector/Projector3D2OrderGPU.cpp b/src/Projector/Projector3D2OrderGPU.cpp index 39342b204..910fc7d14 100755 --- a/src/Projector/Projector3D2OrderGPU.cpp +++ b/src/Projector/Projector3D2OrderGPU.cpp @@ -25,7 +25,7 @@ Projector3D2OrderGPU::Projector3D2OrderGPU( Params ¶meters, Patch *a_patch ) // initialize it's member variable) we better initialize // Projector2D2OrderGPU's member variable after explicitly initializing // Projector2D. 
- not_spectral = !parameters.is_pxr; + not_spectral_ = !parameters.is_pxr; dt = parameters.timestep; dts2 = dt / 2.0; dts4 = dts2 / 2.0; @@ -83,7 +83,7 @@ currentDeposition3DOnDevice( double *__restrict__ Jx, int k_domain_begin, int nprimy, int nprimz, - int not_spectral ); + int not_spectral_ ); extern "C" void densityDeposition3DOnDevice( @@ -114,7 +114,7 @@ densityDeposition3DOnDevice( int k_domain_begin, int nprimy, int nprimz, - int not_spectral ); + int not_spectral_ ); #endif namespace { // Unnamed namespace == static == internal linkage == no exported symbols @@ -148,7 +148,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy int nprimy, int nprimz, double, - int not_spectral ) + int not_spectral_ ) { #if defined( SMILEI_ACCELERATOR_MODE ) currentDeposition3DOnDevice( Jx, @@ -181,7 +181,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral ); + not_spectral_ ); #else SMILEI_ASSERT( false ); #endif @@ -213,7 +213,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy int nprimy, int nprimz, double, - int not_spectral ) + int not_spectral_ ) { #if defined( SMILEI_ACCELERATOR_MODE ) densityDeposition3DOnDevice( @@ -243,7 +243,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral ); + not_spectral_ ); #else SMILEI_ASSERT( false ); #endif @@ -401,7 +401,7 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, k_domain_begin_, nprimy, nprimz, one_third, - not_spectral ); + not_spectral_ ); double *const __restrict__ b_rho = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->data() : EMfields->rho_->data(); unsigned int rho_size = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->size() : EMfields->rho_->size(); @@ -416,7 +416,7 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, k_domain_begin_, nprimy, nprimz, one_third, - not_spectral ); + not_spectral_ ); // If requested performs then the charge density deposition } else { @@ -440,7 +440,7 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, k_domain_begin_, nprimy, nprimz, one_third, - not_spectral ); + not_spectral_ ); } // TODO(Etienne M): DIAGS. Find a way to get rho. We could: diff --git a/src/Projector/Projector3D2OrderGPU.h b/src/Projector/Projector3D2OrderGPU.h index 2fac2402e..5aa1927ac 100755 --- a/src/Projector/Projector3D2OrderGPU.h +++ b/src/Projector/Projector3D2OrderGPU.h @@ -78,7 +78,7 @@ class Projector3D2OrderGPU : public Projector3D double dt; double dts2; double dts4; - int not_spectral; + int not_spectral_; unsigned int x_dimension_bin_count_; unsigned int y_dimension_bin_count_; unsigned int z_dimension_bin_count_; diff --git a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu old mode 100644 new mode 100755 index 195a02667..4c6e07224 --- a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu +++ b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu @@ -162,7 +162,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { // Potential future work for optimization: Break the kernel into smaller // pieces (lds init/store, coeff computation, deposition etc..) 
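In the 3D hunks below, Jy and Jz get different not_spectral_ offsets because, with FDTD, each current component is one cell longer (dual) along its own direction. A minimal sketch of the addressing, assuming the kernel's row-major (x, y, z) layout and hypothetical helper names:

// base: spectral layout, a plain nx * ny * nz row-major cube.
inline int jy_index3d( int ix, int iy, int iz, int ny, int nz, int not_spectral )
{
    const int base = ( ix * ny + iy ) * nz + iz;
    return base + not_spectral * ix * nz;           // Jy dual in y: rows of ny + 1
}

inline int jz_index3d( int ix, int iy, int iz, int ny, int nz, int not_spectral )
{
    const int base = ( ix * ny + iy ) * nz + iz;
    return base + not_spectral * ( ix * ny + iy );  // Jz dual in z: depth of nz + 1
}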
@@ -501,8 +501,8 @@ namespace cudahip { // These atomics are basically free (very few of them). atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[field_index] ) ); - atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * global_x_scratch_space_coordinate * nprimz], static_cast( Jy_scratch_space[field_index] ) ); - atomic::GDS::AddNoReturn( &device_Jz[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * (global_x_scratch_space_coordinate * nprimy + global_y_scratch_space_coordinate)], static_cast( Jz_scratch_space[field_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * global_x_scratch_space_coordinate * nprimz], static_cast( Jy_scratch_space[field_index] ) ); + atomic::GDS::AddNoReturn( &device_Jz[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * (global_x_scratch_space_coordinate * nprimy + global_y_scratch_space_coordinate)], static_cast( Jz_scratch_space[field_index] ) ); } } // end DepositCurrent @@ -536,7 +536,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { // TODO(Etienne M): refactor this function. Break it into smaller // pieces (lds init/store, coeff computation, deposition etc..) @@ -716,7 +716,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 3 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -767,7 +767,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); @@ -799,7 +799,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif @@ -836,7 +836,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 3 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -886,7 +886,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral ); + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) @@ -914,7 +914,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif diff --git a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h old mode 100644 new mode 100755 index 94368f4dd..eba3f0d0d --- a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h +++ b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h @@ -17,9 +17,8 @@ #include "gpu.h" namespace cudahip { -//static -inline void - currentDepositionKernel3D( double *__restrict__ host_Jx, +//static inline +void currentDepositionKernel3D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, int Jx_size, @@ -50,11 +49,10 @@ inline void int k_domain_begin, int nprimy, int nprimz, - int not_spectral ); + int not_spectral_ ); -//static -inline void - 
densityDepositionKernel3D( +//static inline +void densityDepositionKernel3D( double *__restrict__ host_rho, int rho_size, const double *__restrict__ device_particle_position_x, @@ -82,7 +80,7 @@ inline void int k_domain_begin, int nprimy, int nprimz, - int not_spectral ); + int not_spectral_ ); } // namespace cudahip diff --git a/src/SmileiMPI/SmileiMPI.cpp b/src/SmileiMPI/SmileiMPI.cpp index c35a69fe9..262a57b34 100755 --- a/src/SmileiMPI/SmileiMPI.cpp +++ b/src/SmileiMPI/SmileiMPI.cpp @@ -1319,9 +1319,9 @@ void SmileiMPI::isend( ElectroMagn *EM, int to, int &irequest, vector( EM->emBoundCond[bcId] ) ) { ElectroMagnBC1D_SM *embc = static_cast( EM->emBoundCond[bcId] ); - MPI_Isend( &( embc->By_val ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); + MPI_Isend( &( embc->By_val_ ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); irequest++; - MPI_Isend( &( embc->Bz_val ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); + MPI_Isend( &( embc->Bz_val_ ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); irequest++; } else if( dynamic_cast( EM->emBoundCond[bcId] ) ) { // BCs at the x-border @@ -1855,9 +1855,9 @@ void SmileiMPI::recv( ElectroMagn *EM, int from, int &tag, bool recv_xmin_bc ) if( dynamic_cast( EM->emBoundCond[bcId] ) ) { ElectroMagnBC1D_SM *embc = static_cast( EM->emBoundCond[bcId] ); MPI_Status status; - MPI_Recv( &( embc->By_val ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); + MPI_Recv( &( embc->By_val_ ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); tag++; - MPI_Recv( &( embc->Bz_val ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); + MPI_Recv( &( embc->Bz_val_ ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); tag++; } else if( dynamic_cast( EM->emBoundCond[bcId] ) ) { // BCs at the x-border From 530529d6bc2b9b683de311df7af8e33332aa5d14 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Thu, 18 Apr 2024 21:39:27 +0200 Subject: [PATCH 10/54] fix analysis --- .../validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py b/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py index ee807d65b..da25c961c 100644 --- a/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py +++ b/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py @@ -247,7 +247,7 @@ def adaptive_error(values, statistics, thresholds): thresholds = {} thresholds["points"] = np.array([0. 
,10 ,100,1000]) -thresholds["factor"] = np.array([1e9, 1.,0.5, 0.2]) +thresholds["factor"] = np.array([1e9, 1.,0.6, 0.2]) Validate("Average gamma for the electrons vs time", average_gamma["electron"], adaptive_error(average_gamma["electron"], Nelectron, thresholds)) Validate("Average gamma for the positrons vs time", average_gamma["positron"], adaptive_error(average_gamma["positron"], Npositron, thresholds)) From ff0266ec8be29aee51ce53e63c9b7d78d821f261 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Fri, 19 Apr 2024 10:20:49 +0200 Subject: [PATCH 11/54] more --- .../validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py b/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py index da25c961c..8d5b8ddb1 100644 --- a/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py +++ b/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py @@ -247,7 +247,7 @@ def adaptive_error(values, statistics, thresholds): thresholds = {} thresholds["points"] = np.array([0. ,10 ,100,1000]) -thresholds["factor"] = np.array([1e9, 1.,0.6, 0.2]) +thresholds["factor"] = np.array([1e9, 1.,0.7, 0.2]) Validate("Average gamma for the electrons vs time", average_gamma["electron"], adaptive_error(average_gamma["electron"], Nelectron, thresholds)) Validate("Average gamma for the positrons vs time", average_gamma["positron"], adaptive_error(average_gamma["positron"], Npositron, thresholds)) From 73b7de886b88a1183e2febafcb8e981354823d77 Mon Sep 17 00:00:00 2001 From: cprouveur Date: Fri, 19 Apr 2024 17:15:30 +0200 Subject: [PATCH 12/54] added the appropriate field1D and projector factory source files + cleanup ; no results on adastra, probably an issue with openmp --- src/ElectroMagnSolver/MA_Solver1D_norm.cpp | 43 ------ src/ElectroMagnSolver/MF_Solver1D_Yee.cpp | 29 ---- src/Field/Field1D.cpp | 127 ++++++++++++++++-- src/Interpolator/Interpolator1D2Order.cpp | 93 +------------ src/Interpolator/Interpolator1D2Order.h | 2 +- src/Particles/nvidiaParticles.cu | 13 +- src/Projector/Projector1D2OrderGPU.cpp | 93 +------------ .../Projector1D2OrderGPUKernelCUDAHIP.cu | 33 ----- src/Projector/Projector3D2OrderGPUKernel.cpp | 3 - .../Projector3D2OrderGPUKernelCUDAHIP.cu | 3 - src/Projector/ProjectorFactory.h | 7 +- 11 files changed, 129 insertions(+), 317 deletions(-) diff --git a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp index 9b9f0d53d..4ef123b2d 100755 --- a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp +++ b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp @@ -15,17 +15,8 @@ MA_Solver1D_norm::~MA_Solver1D_norm() void MA_Solver1D_norm::operator()( ElectroMagn *fields ) { - { const unsigned int nx_p = fields->dimPrim[0]; const unsigned int nx_d = fields->dimDual[0]; - /*Field1D *Ex1D = static_cast( fields->Ex_ ); - Field1D *Ey1D = static_cast( fields->Ey_ ); - Field1D *Ez1D = static_cast( fields->Ez_ ); - Field1D *By1D = static_cast( fields->By_ ); - Field1D *Bz1D = static_cast( fields->Bz_ ); - Field1D *Jx1D = static_cast( fields->Jx_ ); - Field1D *Jy1D = static_cast( fields->Jy_ ); - Field1D *Jz1D = static_cast( fields->Jz_ );*/ double *const __restrict__ Ex1D = fields->Ex_->data(); // [x] : dual in x primal in y,z double *const __restrict__ Ey1D = fields->Ey_->data(); // [x] : dual in y primal in x,z @@ -37,18 +28,6 @@ void MA_Solver1D_norm::operator()( ElectroMagn *fields ) const double *const 
__restrict__ Jy1D = fields->Jy_->data(); // [x] : dual in y primal in x,z const double *const __restrict__ Jz1D = fields->Jz_->data(); // [x] : dual in z primal in x,y - { - fields->Ex_->copyFromDeviceToHost(); - fields->Ey_->copyFromDeviceToHost(); - fields->Ez_->copyFromDeviceToHost(); - fields->Jx_->copyFromDeviceToHost(); - fields->Jy_->copyFromDeviceToHost(); - fields->Jz_->copyFromDeviceToHost(); - } - std::cout<< "printing before in MA solver ex, ey and ez for nx_d="<Ex_->copyFromDeviceToHost(); - fields->Ey_->copyFromDeviceToHost(); - fields->Ez_->copyFromDeviceToHost(); - } - } - // to be deleted - { - const unsigned int nx_p = fields->dimPrim[0]; - const unsigned int nx_d = fields->dimDual[0]; - double *const __restrict__ Ex1D = fields->Ex_->data(); // [x] : dual in x primal in y,z - double *const __restrict__ Ey1D = fields->Ey_->data(); // [x] : dual in y primal in x,z - double *const __restrict__ Ez1D = fields->Ez_->data(); // [x] : dual in z primal in x,y - - std::cout<< "printing after in MA solver ex, ey and ez for nx_d="<dimPrim[0]; const unsigned int nx_d = fields->dimDual[0]; - // Static-cast of the fields - /*Field1D* Ey1D; - Field1D* Ez1D; - if (isEFilterApplied) { - Ey1D = static_cast(fields->filter_->Ey_[0]); - Ez1D = static_cast(fields->filter_->Ez_[0]); - } else { - Ey1D = static_cast(fields->Ey_); - Ez1D = static_cast(fields->Ez_); - }*/ const double *const __restrict__ Ey1D = isEFilterApplied ? fields->filter_->Ey_[0]->data() : fields->Ey_->data(); // [ix] : dual in y primal in x,z const double *const __restrict__ Ez1D = isEFilterApplied ? fields->filter_->Ez_[0]->data() : fields->Ez_->data();// [ix] : dual in z primal in x,y - //Field1D *By1D = static_cast( fields->By_ ); - //Field1D *Bz1D = static_cast( fields->Bz_ ); double *const __restrict__ By1D = fields->By_->data();// [ix] : dual in x,z primal in y double *const __restrict__ Bz1D = fields->Bz_->data();// [ix] : dual in x,y primal in z - // to be deleted - /*std::cout<< "printing before in FM solver by and bz for nx_d-1="<By_->copyFromDeviceToHost(); - fields->Bz_->copyFromDeviceToHost(); - } - std::cout<< "printing after in FM solver by and bz for nx_d-1="<allocateAndCopyFromHostToDevice(); + recvFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); + } +#endif } else if( ghost_size != (int) sendFields_[iDim*2+iNeighbor]->dims_[iDim] ) { +#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) + ERROR( "To Do GPU : envelope" ); +#endif delete sendFields_[iDim*2+iNeighbor]; sendFields_[iDim*2+iNeighbor] = new Field1D(size); delete recvFields_[iDim*2+iNeighbor]; recvFields_[iDim*2+iNeighbor] = new Field1D(size); } } - void Field1D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) { std::vector size = dims_; @@ -267,13 +300,30 @@ void Field1D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) unsigned int NX = size[0]; - double* sub = sendFields_[iDim*2+iNeighbor]->data_; - double* field = data_; + double *__restrict__ sub = sendFields_[iDim*2+iNeighbor]->data_; + const double*__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = name[0] == 'B' && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + SMILEI_ASSERT( smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( field ) == + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ) ); + const unsigned 
field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_OPENACC_MODE ) + const int subSize = sendFields_[iDim*2+iNeighbor]->size(); + const int fSize = number_of_points_; + bool fieldName( (name.substr(0,1) == "B") ); + #pragma acc parallel present( field[0:fSize], sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; i size = dims_; @@ -286,8 +336,25 @@ void Field1D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) unsigned int NX = size[0]; - double* sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; - double* field = data_; + const double *__restrict__ sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; + double *__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = name[0] == 'B' && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + const unsigned field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) \ + map( tofrom : field [field_first:field_last - field_first] ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_OPENACC_MODE ) + int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); + const int fSize = number_of_points_; + bool fieldName( name.substr(0,1) == "B" ); + #pragma acc parallel present( field[0:fSize], sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; idata_; - double* field = data_; + double *__restrict__ sub = sendFields_[iDim*2+iNeighbor]->data_; + const double *__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = (name[0] == 'J' || name[0] == 'R') && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + const unsigned field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) \ + map( to : field [field_first:field_last - field_first] ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_OPENACC_MODE ) + const int subSize = sendFields_[iDim*2+iNeighbor]->size(); + const int fSize = number_of_points_; + bool fieldName( ((name.substr(0,1) == "J") || (name.substr(0,1) == "R") ) && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub )); + #pragma acc parallel copy(field[0:fSize]) present( sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; i size = dims_; @@ -324,9 +407,27 @@ void Field1D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) unsigned int NX = size[0]; - double* sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; - double* field = data_; + const double *__restrict__ sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; + double *__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = (name[0] == 'J' || name[0] == 'R') && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + const unsigned field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) \ + map( tofrom : field [field_first:field_last - 
field_first] ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_OPENACC_MODE ) + int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); + int fSize = number_of_points_; + bool fieldName( name.substr(0,1) == "J" || name.substr(0,1) == "R"); + #pragma acc parallel copy(field[0:fSize]) present( sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; idynamics_Epart[ithread].data();//&( smpi->dynamics_Epart[ithread][0] ); - double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data();//&( smpi->dynamics_Bpart[ithread][0] ); + double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data(); + double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data(); - int *const __restrict__ iold = smpi->dynamics_iold[ithread].data();//&( smpi->dynamics_iold[ithread][0] ); - double *const __restrict__ delta = smpi->dynamics_deltaold[ithread].data();//&( smpi->dynamics_deltaold[ithread][0] ); + int *const __restrict__ iold = smpi->dynamics_iold[ithread].data(); + double *const __restrict__ delta = smpi->dynamics_deltaold[ithread].data(); const double *const __restrict__ position_x = particles.getPtrPosition( 0 ); @@ -159,53 +158,8 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, const int last_index = *iend; double accdx_inv[2]; accdx_inv[0]= dx_inv_; - /*std::cout<< "printing before in interpolator ex, ey and ez then bx,by,bz" <Ex_->copyFromDeviceToHost(); - EMfields->Ey_->copyFromDeviceToHost(); - EMfields->Ez_->copyFromDeviceToHost(); - EMfields->Jx_->copyFromDeviceToHost(); - EMfields->Jy_->copyFromDeviceToHost(); - EMfields->Jz_->copyFromDeviceToHost(); - } - std::cout<< "printing before in interpolator after copyFromDeviceToHost ex, ey and ez then bx,by,bz" <dynamics_Epart[ithread] )[0*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[1*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[2*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[0*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[1*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[2*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_iold[ithread] )[0] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_deltaold[ithread] )[0]), nparts ); - - - - std::cout<<"print in interpolator fields wrapper eloc before computation and after CopyDeviceToHost"<use_BTIS3){ - //for (int ipart=*istart; ipart < *iend; ipart++){ #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target map( to : i_domain_begin_) is_device_ptr (position_x) #pragma omp teams distribute parallel for @@ -229,7 +183,6 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, #endif for( int ipart = first_index; ipart < last_index; ipart++ ) { // Normalized particle position - //double xpn = position_x[ipart] * dx_inv_;//particles.position( 0, ipart )*dx_inv_; const double xpn = position_x[ipart] * accdx_inv[0]; // Calculate coeffs int idx_p[1], idx_d[1]; @@ -297,11 +250,6 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, BzpartBTIS3 
[first_index:interpolation_range_size],\ // ? - /* Field1D *By1D_mBTIS3 = static_cast( EMfields->By_mBTIS3 ); - Field1D *Bz1D_mBTIS3 = static_cast( EMfields->Bz_mBTIS3 ); - double *BypartBTIS3 = &( smpi->dynamics_Bpart_yBTIS3[ithread][0] ); - double *BzpartBTIS3 = &( smpi->dynamics_Bpart_zBTIS3[ithread][0] );*/ - for (int ipart=*istart; ipart < *iend; ipart++){ // Normalized particle position @@ -343,39 +291,6 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, #endif } // end with B-TIS interpolation - /*{ - EMfields->Ex_->copyFromDeviceToHost(); - EMfields->Ey_->copyFromDeviceToHost(); - EMfields->Ez_->copyFromDeviceToHost(); - } - double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data();//&( smpi->dynamics_Epart[ithread][0] ); - double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data();//&( smpi->dynamics_Bpart[ithread][0] ); -*/ - } - // to be deleted - { - const int nparts = particles.numberOfParticles(); - double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data();//&( smpi->dynamics_Epart[ithread][0] ); - double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data();//&( smpi->dynamics_Bpart[ithread][0] ); - std::cout<< std::setprecision (15)<<"print in interpolator fields wrapper eloc before CopyDeviceToHost"<dynamics_Epart[ithread] )[0*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[1*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Epart[ithread] )[2*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[0*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[1*nparts] ), nparts ); - smilei::tools::gpu::HostDeviceMemoryManagement::CopyDeviceToHost( &( ( smpi->dynamics_Bpart[ithread] )[2*nparts] ), nparts ); - - } - std::cout<<"print in interpolator fields wrapper eloc after CopyDeviceToHost"<( idx_p[0] ); + delta = xpn - static_cast( idx_p[0] ); delta2 = delta * delta; // pow( delta_p[0], 2 ); // square of the normalized distance to the central node coeffxp[0] = 0.5 * ( delta2 - delta_p[0] + 0.25 ); diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index 16941b152..693f87dab 100755 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -743,8 +743,7 @@ namespace detail { static_cast( particle_container.getPtrPosition( 0 ) ) ) ); const auto last = first + particle_container.deviceSize(); int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); - printf ( "CellStartingGlobalIndex_for_x %d res %f patch size %d \n",CellStartingGlobalIndex_for_x,parameters.res_space[0], parameters.patch_size_[0] ); - doComputeParticleClusterKey( first, last, + doComputeParticleClusterKey( first, last, Cluster1D{ parameters.res_space[0], parameters.patch_size_[0], CellStartingGlobalIndex_for_x} ); @@ -762,7 +761,7 @@ namespace detail { const auto last = first + particle_container.deviceSize(); int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - doComputeParticleClusterKey( first, last, + doComputeParticleClusterKey( first, last, Cluster2D{ parameters.res_space[0], parameters.res_space[1], parameters.patch_size_[0], @@ -785,7 +784,7 @@ namespace detail { int 
CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); int CellStartingGlobalIndex_for_z = a_parent_patch.getCellStartingGlobalIndex_noGC(2); - doComputeParticleClusterKey( first, last, + doComputeParticleClusterKey( first, last, Cluster3D{ parameters.res_space[0], parameters.res_space[1], parameters.res_space[2], @@ -971,7 +970,6 @@ namespace detail { // TODO(Etienne M): Find a better way to dispatch at runtime. This is // complex to read and to maintain. int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); - printf("CellStartingGlobalIndex_for_x %d \n" , CellStartingGlobalIndex_for_x ); const Cluster1D cluster_manipulator{ parameters.res_space[0], parameters.patch_size_[0], @@ -1035,7 +1033,6 @@ namespace detail { int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - printf("CellStartingGlobalIndex_for_x %d \n" , CellStartingGlobalIndex_for_x ); const Cluster2D cluster_manipulator{ parameters.res_space[0], parameters.res_space[1], parameters.patch_size_[0], @@ -1482,7 +1479,6 @@ void nvidiaParticles::initializeDataOnDevice() // setHostBinIndex(); } else { - printf( " parent patch %p cells starting global index %d \n", parent_patch_, parent_patch_->getCellStartingGlobalIndex_noGC(0) ); // At this point, a copy of the host particles and last_index is on the // device and we know we support the space dimension. detail::Cluster::computeParticleClusterKey( *this, *parameters_, *parent_patch_ ); @@ -1956,9 +1952,6 @@ extern "C" { void* CreateGPUParticles( const void* parameters, const void* a_parent_patch ) { - const Patch *temp = static_cast( a_parent_patch ); - - printf( " in create GPU parent patch %p cells starting global index %d \n", a_parent_patch, temp->getCellStartingGlobalIndex_noGC(0) ); return new nvidiaParticles{ *static_cast( parameters ), *static_cast( a_parent_patch ) }; } diff --git a/src/Projector/Projector1D2OrderGPU.cpp b/src/Projector/Projector1D2OrderGPU.cpp index 79d879024..c63223885 100755 --- a/src/Projector/Projector1D2OrderGPU.cpp +++ b/src/Projector/Projector1D2OrderGPU.cpp @@ -195,14 +195,11 @@ void Projector1D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, int ispec, int icell, int ipart_ref ) -{ { std::vector &iold = smpi->dynamics_iold[ithread]; std::vector &delta = smpi->dynamics_deltaold[ithread]; std::vector &invgf = smpi->dynamics_invgf[ithread]; - EMfields->rho_->copyFromDeviceToHost(); - EMfields->rho_s[ispec]->copyFromDeviceToHost(); if( diag_flag ) { double *const __restrict__ b_Jx = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->data() : EMfields->Jx_->data(); @@ -220,20 +217,6 @@ void Projector1D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, // Does not compute Rho ! 
#if defined( SMILEI_ACCELERATOR_MODE ) - /*currentsAndDensity( b_Jx, b_Jy, b_Jz, b_rho, - Jx_size, Jy_size, Jz_size, rho_size, - particles, x_dimension_bin_count_, - invgf.data(), iold.data(), delta.data(), - inv_cell_volume, - dx_inv_, - dx_ov_dt_, - i_domain_begin_, - not_spectral_ );*/ - // to be deleted - std::cout<<"in projector1D2orderGPUKernel.cpp l229: rho_size= "<rho_->copyFromDeviceToHost(); - EMfields->rho_s[ispec]->copyFromDeviceToHost(); - EMfields->Jx_->copyFromDeviceToHost(); - EMfields->Jx_s[ispec]->copyFromDeviceToHost(); - EMfields->Jy_->copyFromDeviceToHost(); - EMfields->Jy_s[ispec]->copyFromDeviceToHost(); - EMfields->Jz_->copyFromDeviceToHost(); - EMfields->Jz_s[ispec]->copyFromDeviceToHost(); - std::cout<<"in projector1D2orderGPUKernel.cpp l251 after projection: rho_size= "<Jx_->data(); - Jy_ = EMfields->Jy_->data(); - Jz_ = EMfields->Jz_->data(); - rho_ = EMfields->rho_->data(); - - /*currents( Jx_, Jy_, Jz_, - EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), - particles, x_dimension_bin_count_, y_dimension_bin_count_, - invgf.data(), iold.data(), delta.data(), - inv_cell_volume, - dx_inv_, dy_inv_, - dx_ov_dt_, dy_ov_dt_, - i_domain_begin_, j_domain_begin_, - nprimy, - one_third, - not_spectral_ ); - } - double *const __restrict__ b_Jx = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->data() : EMfields->Jx_->data(); - unsigned int Jx_size = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->size() : EMfields->Jx_->size(); - - double *const __restrict__ b_Jy = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->data() : EMfields->Jy_->data(); - unsigned int Jy_size = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->size() : EMfields->Jy_->size(); - - double *const __restrict__ b_Jz = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->data() : EMfields->Jz_->data(); - unsigned int Jz_size = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->size() : EMfields->Jz_->size();//*/ - /*Jx_ = EMfields->Jx_->data(); - Jy_ = EMfields->Jy_->data(); - Jz_ = EMfields->Jz_->data();*/ - - /*currents( Jx_, Jy_, Jz_, - EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), - particles, x_dimension_bin_count_, - invgf.data(), iold.data(), delta.data(), - inv_cell_volume, - dx_inv_, - dx_ov_dt_, - i_domain_begin_, - not_spectral_ );*/ #if defined( SMILEI_ACCELERATOR_MODE ) - //double *device_Jx = smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( b_Jx ) ; - //printf("testing device Jx:, %p \n", device_Jx); - currentDepositionKernel1DOnDevice(Jx_, Jy_, Jz_, //b_Jx,b_Jy,b_Jz, - //Jx_size, Jy_size, Jz_size, + currentDepositionKernel1DOnDevice(Jx_, Jy_, Jz_, EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), particles.getPtrPosition( 0 ), particles.getPtrMomentum( 1 ), @@ -339,27 +269,6 @@ void Projector1D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, } } } -// to be deleted -{ - double *const __restrict__ b_Jx = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->data() : EMfields->Jx_->data(); - unsigned int Jx_size = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->size() : EMfields->Jx_->size(); - - double *const __restrict__ b_Jy = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->data() : EMfields->Jy_->data(); - unsigned int Jy_size = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->size() : EMfields->Jy_->size(); - - double *const __restrict__ b_Jz = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->data() : EMfields->Jz_->data(); - unsigned int Jz_size = EMfields->Jz_s[ispec] ? 
EMfields->Jz_s[ispec]->size() : EMfields->Jz_->size(); - - double *const __restrict__ b_rho = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->data() : EMfields->rho_->data(); - unsigned int rho_size = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->size() : EMfields->rho_->size(); - - std::cout<<"in projector1D2orderGPUKernel.cpp l336: rho_size= "<rho_s[ispec] ? " - << EMfields->rho_s[ispec] << " Jx_size " << Jx_size<< " Jy_size " << Jy_size<< " Jz_size " << Jz_size<< std::endl; - for( int ipart=0 ; ipart( Jx_scratch_space[scratch_space_index] ) ); atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); // We handle the FTDT/picsar @@ -731,8 +715,6 @@ namespace cudahip1d { const unsigned int first_particle = workgroup_dedicated_bin_index == 0 ? 0 : device_bin_index[workgroup_dedicated_bin_index - 1]; const unsigned int last_particle = device_bin_index[workgroup_dedicated_bin_index]; - //printf(" first_particle %d last_particle %d loopstride %d \n",first_particle, last_particle, loop_stride); - for( unsigned int particle_index = first_particle + thread_index_offset; particle_index < last_particle; particle_index += loop_stride ) { @@ -740,10 +722,6 @@ namespace cudahip1d { const int *const __restrict__ iold = &device_iold_[particle_index]; const double *const __restrict__ deltaold = &device_deltaold_[particle_index]; - //printf("in projector cuda l735: particle charge= %f weight %f position_x= %f, momentum y = %f, momentum z = %f, charge*sqrt(2) %+4.15e \n", static_cast( device_particle_charge[particle_index]) , static_cast( device_particle_weight[particle_index]), - // static_cast( device_particle_position_x[particle_index] ), static_cast( device_particle_momentum_y[particle_index] ), - // static_cast( device_particle_momentum_z[particle_index] ), static_cast( device_particle_charge[particle_index]) * static_cast(sqrt(2.0))); - ComputeFloat Sx0[5]; ComputeFloat Sx1[5]; @@ -954,12 +932,6 @@ namespace cudahip1d { checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) - //double *device_Jx = smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ) ; - //printf("testing device Jx:, %p \n", device_Jx); - /*for (int i=0; i( kWorkgroupSize ), 1, 1 }; - //printf("ClusterWidth %d clusterGhostCellBorderWidth %d x_dimension_bin_count %d \n",Params::getGPUClusterWidth( 1), Params::getGPUClusterGhostCellBorderWidth( 2), x_dimension_bin_count); - // NOTE: On cards lacking hardware backed Binary64 atomic operations, // falling back to Binary32 (supposing hardware support for atomic // operations) can lead to drastic performance improvement. 
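The NOTE just above about Binary64 atomics is the main performance consideration in this deposition kernel. Below is a minimal CUDA sketch of what such a Binary32 fallback could look like, assuming a templated accumulator type; the kernel and all names are illustrative only, not Smilei's actual atomic::GDS helpers.

    // Illustrative sketch only (assumption: not Smilei's actual kernel or
    // its atomic::GDS helpers). On architectures lacking hardware Binary64
    // atomicAdd (compute capability < 6.0), instantiating AccumT = float
    // keeps the atomics in hardware at the cost of accumulation precision.
    #include <cuda_runtime.h>

    template <typename AccumT>
    __global__ void depositRhoSketch( AccumT *rho, const double *weight,
                                      const int *cell_of_particle, int particle_count )
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if( i < particle_count ) {
            // One atomic add per particle into its cell of the density grid.
            atomicAdd( &rho[cell_of_particle[i]], static_cast<AccumT>( weight[i] ) );
        }
    }

On cards without native double-precision atomicAdd, a software compare-and-swap loop is the usual fallback, and instantiating with AccumT = float avoids it entirely at the cost of accumulation precision.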
@@ -1064,9 +1034,6 @@ namespace cudahip1d { checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) - //printf("device bin index in projector cuda: %d \n",*host_bin_index); - //for(int i=0; i<*host_bin_index;++i) - // std::cout<<"in projector cuda, device_particle_position_x[i]"<< device_particle_position_x[i]< Date: Thu, 25 Apr 2024 10:05:25 +0200 Subject: [PATCH 13/54] add smilei_omp_threads in namelist --- .../gpu/tst3d_gpu_o2_thermal_plasma_medium.py | 2 +- .../gpu/tst3d_gpu_o2_thermal_plasma_short.py | 1 - .../gpu/tst3d_v_o2_thermal_plasma_medium.py | 2 +- .../gpu/tst3d_v_o2_thermal_plasma_short.py | 1 - benchmarks/tst2d_18_em_pml.py | 1 - benchmarks/tst2d_s_o4_laser_wake_vay.py | 2 -- benchmarks/tst2d_s_o4_radiation_pressure_acc.py | 2 -- .../tst2d_tasks_01_radiation_pressure_acc.py | 1 - benchmarks/tst2d_v_o2_em_propagation.py | 2 -- benchmarks/tst2d_v_o4_em_propagation.py | 2 -- benchmarks/tst2d_v_o4_laser_wake_vay.py | 2 -- .../tst2d_v_o4_multiphoton_Breit_Wheeler.py | 3 --- benchmarks/tst2d_v_o4_radiation_pressure_acc.py | 2 -- benchmarks/tst3d_s_o4_em_propagation.py | 2 -- doc/Sphinx/Overview/releases.rst | 11 ++++++----- doc/Sphinx/Use/namelist.rst | 17 ++++++++++++----- src/Params/Params.cpp | 14 +++++++++----- src/Python/pyinit.py | 3 ++- 18 files changed, 31 insertions(+), 39 deletions(-) diff --git a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py index a50614236..cb8c8f26a 100644 --- a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py +++ b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py @@ -66,7 +66,7 @@ def InitialChargeDensity(x, y, z): number_of_patches = kPatchPerGridDimension, EM_boundary_conditions = [ ["periodic"] ], print_every = 10, - random_seed = smilei_mpi_rank) + ) Vectorization(mode = "off") diff --git a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py index 548746977..a627232f9 100644 --- a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py +++ b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py @@ -61,7 +61,6 @@ gpu_computing = True, # random_seed = 0xDEADBEEF, - random_seed = smilei_mpi_rank, ) Vectorization( diff --git a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py index 524f564d0..92c598c19 100644 --- a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py +++ b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py @@ -67,7 +67,7 @@ def InitialChargeDensity(x, y, z): number_of_patches = kPatchPerGridDimension, EM_boundary_conditions = [ ["periodic"] ], print_every = 10, - random_seed = smilei_mpi_rank) + ) Vectorization(mode = "on") diff --git a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py index 3672fd9d0..bc553aa62 100644 --- a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py +++ b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py @@ -61,7 +61,6 @@ gpu_computing = False, # random_seed = 0xDEADBEEF, - random_seed = smilei_mpi_rank, ) Vectorization( diff --git a/benchmarks/tst2d_18_em_pml.py b/benchmarks/tst2d_18_em_pml.py index 18dc5ee09..703fe50fc 100755 --- a/benchmarks/tst2d_18_em_pml.py +++ b/benchmarks/tst2d_18_em_pml.py @@ -25,7 +25,6 @@ ['PML','PML'], ], number_of_pml_cells = [[10,10],[10,10]], - random_seed = smilei_mpi_rank ) Antenna( diff --git a/benchmarks/tst2d_s_o4_laser_wake_vay.py b/benchmarks/tst2d_s_o4_laser_wake_vay.py index 4a4725b51..bb75f29e6 100644 --- 
a/benchmarks/tst2d_s_o4_laser_wake_vay.py +++ b/benchmarks/tst2d_s_o4_laser_wake_vay.py @@ -28,8 +28,6 @@ solve_poisson = False, print_every = 100, - - random_seed = smilei_mpi_rank ) MovingWindow( diff --git a/benchmarks/tst2d_s_o4_radiation_pressure_acc.py b/benchmarks/tst2d_s_o4_radiation_pressure_acc.py index 7adbc8844..755cbf763 100755 --- a/benchmarks/tst2d_s_o4_radiation_pressure_acc.py +++ b/benchmarks/tst2d_s_o4_radiation_pressure_acc.py @@ -28,8 +28,6 @@ ['silver-muller'], ['periodic'], ], - - random_seed = smilei_mpi_rank ) Vectorization( diff --git a/benchmarks/tst2d_tasks_01_radiation_pressure_acc.py b/benchmarks/tst2d_tasks_01_radiation_pressure_acc.py index 9d4c9af87..6a8530bf0 100644 --- a/benchmarks/tst2d_tasks_01_radiation_pressure_acc.py +++ b/benchmarks/tst2d_tasks_01_radiation_pressure_acc.py @@ -39,7 +39,6 @@ ['periodic'], ], cluster_width = 16, - random_seed = smilei_mpi_rank ) diff --git a/benchmarks/tst2d_v_o2_em_propagation.py b/benchmarks/tst2d_v_o2_em_propagation.py index cc152fa80..76741f1bd 100644 --- a/benchmarks/tst2d_v_o2_em_propagation.py +++ b/benchmarks/tst2d_v_o2_em_propagation.py @@ -32,8 +32,6 @@ ], EM_boundary_conditions_k = [[cos(ang), sin(ang)],[-1.,0.],[0.,1.],[0.,-1.]], - - random_seed = smilei_mpi_rank ) Vectorization( diff --git a/benchmarks/tst2d_v_o4_em_propagation.py b/benchmarks/tst2d_v_o4_em_propagation.py index c6ed8b064..2e15305b0 100644 --- a/benchmarks/tst2d_v_o4_em_propagation.py +++ b/benchmarks/tst2d_v_o4_em_propagation.py @@ -32,8 +32,6 @@ ], EM_boundary_conditions_k = [[cos(ang), sin(ang)],[-1.,0.],[0.,1.],[0.,-1.]], - - random_seed = smilei_mpi_rank ) Vectorization( diff --git a/benchmarks/tst2d_v_o4_laser_wake_vay.py b/benchmarks/tst2d_v_o4_laser_wake_vay.py index 969263838..9e56337d0 100644 --- a/benchmarks/tst2d_v_o4_laser_wake_vay.py +++ b/benchmarks/tst2d_v_o4_laser_wake_vay.py @@ -28,8 +28,6 @@ solve_poisson = False, print_every = 100, - - random_seed = smilei_mpi_rank ) Vectorization( diff --git a/benchmarks/tst2d_v_o4_multiphoton_Breit_Wheeler.py b/benchmarks/tst2d_v_o4_multiphoton_Breit_Wheeler.py index 7db269c5e..91907aee9 100755 --- a/benchmarks/tst2d_v_o4_multiphoton_Breit_Wheeler.py +++ b/benchmarks/tst2d_v_o4_multiphoton_Breit_Wheeler.py @@ -95,9 +95,6 @@ def n0_positron(x,y): simulation_time = Tsim, EM_boundary_conditions = [field_cond, field_cond], - - random_seed = smilei_mpi_rank, - reference_angular_frequency_SI = wr ) diff --git a/benchmarks/tst2d_v_o4_radiation_pressure_acc.py b/benchmarks/tst2d_v_o4_radiation_pressure_acc.py index c0feb79d2..353418604 100644 --- a/benchmarks/tst2d_v_o4_radiation_pressure_acc.py +++ b/benchmarks/tst2d_v_o4_radiation_pressure_acc.py @@ -28,8 +28,6 @@ ['silver-muller'], ['periodic'], ], - - random_seed = smilei_mpi_rank ) diff --git a/benchmarks/tst3d_s_o4_em_propagation.py b/benchmarks/tst3d_s_o4_em_propagation.py index 6a87f2dfa..ef97569d5 100755 --- a/benchmarks/tst3d_s_o4_em_propagation.py +++ b/benchmarks/tst3d_s_o4_em_propagation.py @@ -21,8 +21,6 @@ simulation_time = Tsim, EM_boundary_conditions = [ ['silver-muller'] ], - - random_seed = smilei_mpi_rank ) LaserGaussian3D( diff --git a/doc/Sphinx/Overview/releases.rst b/doc/Sphinx/Overview/releases.rst index 8027d2f3d..5c3e9d046 100755 --- a/doc/Sphinx/Overview/releases.rst +++ b/doc/Sphinx/Overview/releases.rst @@ -30,6 +30,12 @@ Changes made in the repository (not released) * Features: * Relativistic field initialization now supports multiple species and both direction propagations. 
+ * Added the argument ``phase_offset`` in laser definitions such as ``LaserGaussian2D``. + * The ``LaserGaussianAM`` definition will only use one coordinate for its ``focus`` argument + (the transverse coordinate of the focus in this geometry is zero). + * Small improvements in PML for envelope model (AM and 2D). + * Deprecated ``smilei_rand_max``. + * New namelist variables ``smilei_omp_threads`` and ``smilei_total_cores``. * Happi: @@ -44,11 +50,6 @@ Changes made in the repository (not released) * Dark theme (click the switch on the bottom left, or set browser preferences). -* Added the argument ``phase_offset`` in laser definitions such as ``LaserGaussian2D``. -* The ``LaserGaussianAM`` definition will only use one coordinate for its ``focus`` argument - (the transverse coordinate of the focus in this geometry is zero). -* Small improvements in PML for envelope model (AM and 2D). - * Bug fixes: * ``dump_minutes`` often failed to write some checkpoint files. diff --git a/doc/Sphinx/Use/namelist.rst b/doc/Sphinx/Use/namelist.rst index ad318954c..f7deebcae 100755 --- a/doc/Sphinx/Use/namelist.rst +++ b/doc/Sphinx/Use/namelist.rst @@ -60,7 +60,8 @@ for each MPI process). The following steps are executed: * The rank of the current MPI process as :py:data:`smilei_mpi_rank`. * The total number of MPI processes as :py:data:`smilei_mpi_size`. - * The maximum random integer as :py:data:`smilei_rand_max`. + * The number of OpenMP threads per MPI process as :py:data:`smilei_omp_threads`. + * The total number of cores as :py:data:`smilei_total_cores`. #. The namelist(s) is executed. @@ -3619,9 +3620,15 @@ namelist. They should not be re-defined by the user! The total number of MPI processes. -.. - <> - .. py:data:: smilei_rand_max +.. py:data:: smilei_omp_threads + + The number of OpenMP threads per MPI process. + +.. py:data:: smilei_total_cores - The largest random integer. + The total number of cores. +.. note:: + + These variables can be accessed during ``happi`` post-processing, e.g. + ``S.namelist.smilei_mpi_size``.
\ No newline at end of file diff --git a/src/Params/Params.cpp b/src/Params/Params.cpp index bc9fb8ed4..803cdf9e5 100755 --- a/src/Params/Params.cpp +++ b/src/Params/Params.cpp @@ -129,16 +129,20 @@ Params::Params( SmileiMPI *smpi, std::vector<std::string> namelistsFiles ) : PyObject_SetAttrString( Py_main, "_test_mode", Py_False ); PyTools::checkPyError(); - // here we add the rank, in case some script need it + // we add the rank, in case some script needs it PyModule_AddIntConstant( Py_main, "smilei_mpi_rank", smpi->getRank() ); - // here we add the MPI size, in case some script need it + // we add the MPI size, in case some script needs it PyModule_AddIntConstant( Py_main, "smilei_mpi_size", smpi->getSize() ); namelist += string( "smilei_mpi_size = " ) + to_string( smpi->getSize() ) + "\n"; - // here we add the larget int, important to get a valid seed for randomization - PyModule_AddIntConstant( Py_main, "smilei_rand_max", RAND_MAX ); - namelist += string( "smilei_rand_max = " ) + to_string( RAND_MAX ) + "\n\n"; + // we add the number of OpenMP threads, in case some script needs it + PyModule_AddIntConstant( Py_main, "smilei_omp_threads", smpi->getOMPMaxThreads() ); + namelist += string( "smilei_omp_threads = " ) + to_string( smpi->getOMPMaxThreads() ) + "\n"; + + // we add the total number of cores, in case some script needs it + PyModule_AddIntConstant( Py_main, "smilei_total_cores", smpi->getGlobalNumCores() ); + namelist += string( "smilei_total_cores = " ) + to_string( smpi->getGlobalNumCores() ) + "\n"; // Running pyprofiles.py runScript( string( reinterpret_cast<const char *>( pyprofiles_py ), pyprofiles_py_len ), "pyprofiles.py", globals ); diff --git a/src/Python/pyinit.py b/src/Python/pyinit.py index 56febc475..f5aeeb7e1 100755 --- a/src/Python/pyinit.py +++ b/src/Python/pyinit.py @@ -645,7 +645,8 @@ class MultiphotonBreitWheeler(SmileiComponent): # Smilei-defined smilei_mpi_rank = 0 smilei_mpi_size = 1 -smilei_rand_max = 2**31-1 +smilei_omp_threads = 1 +smilei_total_cores = 1 # Variable to set to False for the actual run (useful for the test mode) _test_mode = True From 348faa03cf42084839984b61870942c6a9af05d1 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Fri, 26 Apr 2024 00:08:51 +0200 Subject: [PATCH 14/54] fix particle exchange --- src/Particles/Particles.cpp | 4 ++-- src/Particles/Particles.h | 4 ++-- src/Particles/nvidiaParticles.cu | 14 ++++++++++---- src/Particles/nvidiaParticles.h | 2 +- src/Patch/Patch.cpp | 4 ++-- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp index 688c53085..34eaeb161 100755 --- a/src/Particles/Particles.cpp +++ b/src/Particles/Particles.cpp @@ -1299,13 +1299,13 @@ void Particles::copyFromHostToDevice() { ERROR( "Device only feature, should not have come here!" ); } -void Particles::copyFromDeviceToHost() +void Particles::copyFromDeviceToHost( bool ) { ERROR( "Device only feature, should not have come here!" ); } // Loop all particles and copy the outgoing ones to buffers -void Particles::copyLeavingParticlesToBuffers( const bool copy[], Particles* buffer[] ) +void Particles::copyLeavingParticlesToBuffers( const vector<bool> copy, const vector<Particles*> buffer ) { // Leaving particles have a cell_key equal to -2-direction // where direction goes from 0 to 6 and tells which way the particle escapes.
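The encoding in that last comment is compact enough to deserve an explicit decoder; a minimal sketch, assuming only the -2-direction convention stated there (the helper name is hypothetical):

    // Sketch of the cell_key convention described in the comment above
    // (assumption: derived only from that comment; helper is hypothetical).
    // Particles still inside the box keep cell_key >= 0; a particle leaving
    // through exchange direction d is tagged cell_key = -2 - d.
    inline int leavingDirection( int cell_key )
    {
        return -cell_key - 2; // e.g. cell_key == -2 -> direction 0
    }

With this convention a single integer appears to both flag the particle for removal (any negative key) and select which exchange buffer it is copied to.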
diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h index 86f9f9cac..c0e5958e3 100755 --- a/src/Particles/Particles.h +++ b/src/Particles/Particles.h @@ -435,7 +435,7 @@ class Particles virtual void initializeDataOnDevice(); virtual void initializeIDsOnDevice(); virtual void copyFromHostToDevice(); - virtual void copyFromDeviceToHost(); + virtual void copyFromDeviceToHost( bool copy_keys = false ); //! Return the pointer toward the Position[idim] vector virtual double* getPtrPosition( int idim ) { @@ -475,7 +475,7 @@ class Particles // ----------------------------------------------------------------------------- //! Extract particles leaving the box to buffers // ----------------------------------------------------------------------------- - void copyLeavingParticlesToBuffers( const bool copy[], Particles* buffer[] ); + void copyLeavingParticlesToBuffers( const std::vector<bool> copy, const std::vector<Particles*> buffer ); virtual void copyLeavingParticlesToBuffer( Particles* buffer ); // ----------------------------------------------------------------------------- diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index efca22ad5..af45bfadd 100644 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -1348,7 +1348,7 @@ void nvidiaParticles::copyFromHostToDevice() // ------------------------------------------------------------------------------------------------- //! Copy device to host // ------------------------------------------------------------------------------------------------- -void nvidiaParticles::copyFromDeviceToHost() +void nvidiaParticles::copyFromDeviceToHost( bool copy_keys ) { for (int idim=0;idimcopyFromDeviceToHost(); + buffer->copyFromDeviceToHost( true ); } @@ -1410,7 +1414,8 @@ void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pre nvidia_momentum_[1].begin(), nvidia_momentum_[2].begin(), nvidia_weight_.begin(), - nvidia_charge_.begin() ) ); + nvidia_charge_.begin(), + nvidia_cell_keys_.begin() ) ); const auto source_iterator_last = source_iterator_first + nparts; // std::advance nvidiaParticles* const cp_parts = static_cast<nvidiaParticles*>( buffer ); @@ -1428,7 +1433,8 @@ void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pre cp_parts->nvidia_momentum_[1].begin(), cp_parts->nvidia_momentum_[2].begin(), cp_parts->nvidia_weight_.begin(), - cp_parts->nvidia_charge_.begin() ) ); + cp_parts->nvidia_charge_.begin(), + cp_parts->nvidia_cell_keys_.begin() ) ); // Copy send particles in dedicated data structure thrust::copy_if( thrust::device, diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index ba689f1e8..5fa0a933b 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -78,7 +78,7 @@ class nvidiaParticles : public Particles void copyFromHostToDevice() override; //!
Update the particles from device to host - void copyFromDeviceToHost() override; + void copyFromDeviceToHost( bool copy_keys = false ) override; unsigned int deviceCapacity() const override; diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp index 585f76f97..8fa4022aa 100755 --- a/src/Patch/Patch.cpp +++ b/src/Patch/Patch.cpp @@ -540,8 +540,8 @@ void Patch::copyExchParticlesToBuffers( int ispec, Params &params ) cleanMPIBuffers( ispec, params ); // Make a list of buffers - bool copy[params.nDim_field*2]; - Particles* sendBuffer[params.nDim_field*2]; + vector<bool> copy( params.nDim_field*2, false ); + vector<Particles*> sendBuffer( params.nDim_field*2, nullptr ); for( size_t iDim = 0; iDim < params.nDim_field; iDim++ ) { copy[2*iDim+0] = neighbor_[iDim][0] != MPI_PROC_NULL; copy[2*iDim+1] = neighbor_[iDim][1] != MPI_PROC_NULL; From 4b2f6487debd5d5290f29ef9a4583981ccd19cf2 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Fri, 26 Apr 2024 00:12:42 +0200 Subject: [PATCH 15/54] make happi work with virtualenv --- makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/makefile b/makefile index 3aaff0201..4c9ada85b 100755 --- a/makefile +++ b/makefile @@ -52,7 +52,7 @@ DIRS := $(shell find src -type d) SRCS := $(shell find src/* -name \*.cpp) OBJS := $(addprefix $(BUILD_DIR)/, $(SRCS:.cpp=.o)) DEPS := $(addprefix $(BUILD_DIR)/, $(SRCS:.cpp=.d)) -SITEDIR = $(shell $(PYTHONEXE) -c 'import site; site._script()' --user-site) +SITEDIR = $(shell d=`$(PYTHONEXE) -m site --user-site` && echo $$d || $(PYTHONEXE) -c "import sysconfig; print(sysconfig.get_path('purelib'))") # Smilei tools TABLES_DIR := tools/tables From 8bcaeb4790678eb695a143d10d9ff3956269d820 Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Mon, 29 Apr 2024 06:54:32 +0200 Subject: [PATCH 16/54] add publication --- doc/Sphinx/Overview/material.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 3322c2857..048d8e1b5 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of April 2024, 181 papers have been published covering a broad range of topics: +As of April 2024, 182 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,6 +50,12 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. +.. [Yao2024] + + W. Yao, M. Nakatsutsumi, S. Buffechoux, P. Antici, M. Borghesi, A. Ciardi, S. N. Chen, E. d’Humières, L. Gremillet, R. Heathcote, V. Horný, P. McKenna, M. N. Quinn, L. Romagnani, R. Royle, G. Sarri, Y. Sentoku, H.-P. Schlenvoigt, T. Toncian, O. Tresca, L. Vassura, O. Willi, J. Fuchs, + `Optimizing laser coupling, matter heating, and particle acceleration from solids using multiplexed ultraintense lasers`, + `Matter and Radiation at Extremes 9, 047202 (2024) `_ .. [Luo2024] M. Luo, C. Riconda, I. Pusztai, A. Grassi, J. S. Wurtele, and T.
Fülöp, From 152c4bee0ba2cbabdd72720d9ece45b1e3956139 Mon Sep 17 00:00:00 2001 From: cprouveur Date: Mon, 29 Apr 2024 15:31:27 +0200 Subject: [PATCH 17/54] Fix in coefficients, found on Adastra thanks to a different compiler behaviour compared to nvc++ --- src/Interpolator/Interpolator1D2Order.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/Interpolator/Interpolator1D2Order.h b/src/Interpolator/Interpolator1D2Order.h index c55af0222..9a1b2a9e4 100755 --- a/src/Interpolator/Interpolator1D2Order.h +++ b/src/Interpolator/Interpolator1D2Order.h @@ -89,7 +89,6 @@ class Interpolator1D2Order final : public Interpolator1D idx_p[0] = std::round( xpn ); idx_d[0] = std::round( xpn + 0.5 ); - delta = xpn - static_cast<double>( idx_d[0] ) + 0.5; // normalized distance to the central node delta2 = delta * delta; // square of the normalized distance to the central node @@ -97,17 +96,14 @@ coeffxd[1] = ( 0.75 - delta2 ); coeffxd[2] = 0.5 * ( delta2 + delta + 0.25 ); - - delta = xpn - static_cast<double>( idx_p[0] ); + delta = xpn - static_cast<double>( idx_p[0] ); delta2 = delta * delta; // pow( delta_p[0], 2 ); // square of the normalized distance to the central node - + + delta_p[0] = delta; // normalized distance to the central node coeffxp[0] = 0.5 * ( delta2 - delta_p[0] + 0.25 ); coeffxp[1] = ( 0.75 - delta2 ); coeffxp[2] = 0.5 * ( delta2 + delta_p[0] + 0.25 ); - delta_p[0] = delta; // normalized distance to the central node - - idx_p[0] = idx_p[0] - i_domain_begin_; idx_d[0] = idx_d[0] - i_domain_begin_; From 27dd743d2f95a8e7c8db4acdb4d835809dd9c1b0 Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Thu, 2 May 2024 13:45:00 +0200 Subject: [PATCH 18/54] add publication --- doc/Sphinx/Overview/material.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 048d8e1b5..9e6e17daf 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of April 2024, 182 papers have been published covering a broad range of topics: +As of April 2024, 183 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,6 +50,12 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. +.. [Pan2024] + + Z. Pan, J. Liu, P. Wang, Z. Mei, Z. Cao, D. Kong, S. Xu, Z. Liu, Y. Liang, Z. Peng, T. Xu, T. Song, X. Chen, Q. Wu, Y. Zhang, Q. Han, H. Chen, J. Zhao, Y. Gao, S. Chen, Y. Zhao, X. Yan, Y. Shou, W. Ma, + `Electron acceleration and x-ray generation from near-critical-density carbon nanotube foams driven by moderately relativistic lasers`, + `Physics of Plasmas 31, 043108 (2024) `_ .. [Yao2024] W. Yao, M. Nakatsutsumi, S. Buffechoux, P. Antici, M. Borghesi, A. Ciardi, S. N. Chen, E. d’Humières, L. Gremillet, R. Heathcote, V. Horný, P. McKenna, M. N. Quinn, L. Romagnani, R. Royle, G. Sarri, Y. Sentoku, H.-P. Schlenvoigt, T. Toncian, O. Tresca, L. Vassura, O. Willi, J.
Fuchs, From 337a1ee153cceb54ba38ae936ff2bf9b48142d9b Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Tue, 7 May 2024 10:35:01 +0200 Subject: [PATCH 19/54] Sort on gpu with thrust::gather --- src/Particles/nvidiaParticles.cu | 141 ++++++++++++++++--------------- src/Particles/nvidiaParticles.h | 26 ++++++ 2 files changed, 98 insertions(+), 69 deletions(-) diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index af45bfadd..5a6524a88 100644 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -15,6 +15,7 @@ #include #include #include +#include #include "Patch.h" @@ -468,10 +469,8 @@ namespace detail { ParticleNoKeyIteratorProvider particle_no_key_iterator_provider ) { const auto first_particle = particle_iterator_provider( particle_container ); - - auto last_particle = first_particle + - particle_container.deviceSize(); // Obviously, we use half open ranges - + auto last_particle = first_particle + particle_container.deviceSize(); + // Remove out of bound particles // Using more memory, we could use the faster remove_copy_if // NOTE: remove_if is stable. @@ -479,82 +478,86 @@ namespace detail { first_particle, last_particle, OutOfBoundaryPredicate{} ); - - // Idea 1: - remove_copy_if instead of copy_if - // - sort(the_particles_to_inject) - // - merge - // - compute bins - // NOTE: This method consumes a lot of memory ! O(N) - + const auto initial_count = std::distance( first_particle, last_particle ); const auto inject_count = particle_to_inject.deviceSize(); const auto new_count = initial_count + inject_count; - + + // Resize particles // NOTE: We really want a non-initializing vector here! // It's possible to give a custom allocator to thrust::device_vector. // Create one with construct(<>) as a noop and derive from // thrust::device_malloc_allocator. For now we do an explicit resize. - particle_to_inject.softReserve( new_count ); - particle_to_inject.resize( new_count ); // We probably invalidated the iterators - - // Copy out of cluster/tile/chunk particles - // partition_copy is way slower than copy_if/remove_copy_if on rocthrust - // https://github.com/ROCmSoftwarePlatform/rocThrust/issues/247 - - const auto first_to_inject = particle_iterator_provider( particle_to_inject ); - const auto first_to_reorder = first_to_inject + inject_count; - - // NOTE: copy_if/remove_copy_if are stable. - // First, copy particles that are not in their own cluster anymore - const auto first_already_ordered = thrust::copy_if( thrust::device, - first_particle, last_particle, - first_to_reorder, - OutOfClusterPredicate{ cluster_type } ); - // Then, copy particles that are still in their own cluster - const auto end = thrust::remove_copy_if( thrust::device, - first_particle, last_particle, - first_already_ordered, - OutOfClusterPredicate{ cluster_type } ); - - // Compute or recompute the cluster index of the particle_to_inject - // NOTE: - // - we can "save" some work here if cluster index is already computed - // for the new particles to inject (not the one we got with copy_if). 
- // - doComputeParticleClusterKey( first_to_inject, - first_already_ordered, - cluster_type ); - - const auto first_to_inject_no_key = particle_no_key_iterator_provider( particle_to_inject ); - const auto particle_to_rekey_count = std::distance( first_to_inject, - first_already_ordered ); - - doSortParticleByKey( particle_to_inject.getPtrCellKeys(), - particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, - first_to_inject_no_key ); - - // This free generates a lot of memory fragmentation. - // particle_container.free(); - // Same as for particle_to_inject, non-initializing vector is best. particle_container.softReserve( new_count ); particle_container.resize( new_count ); - - // Merge by key - // NOTE: Dont merge in place on GPU. That means we need an other large buffer! - // - thrust::merge_by_key( thrust::device, - particle_to_inject.getPtrCellKeys(), // Input range 1, first key - particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, // Input range 1, last key - particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, // Input range 2, first key - particle_to_inject.getPtrCellKeys() + new_count, // Input range 2, last key - first_to_inject_no_key, // Input range 1, first value - first_to_inject_no_key + particle_to_rekey_count, // Input range 2, first value - particle_container.getPtrCellKeys(), // Output range first key - particle_no_key_iterator_provider( particle_container ) ); // Output range first value - + + // Combine imported particles to main particles + const auto first = particle_no_key_iterator_provider( particle_container ); + const auto first_to_inject = particle_no_key_iterator_provider( particle_to_inject ); + thrust::copy( thrust::device, + first_to_inject, + first_to_inject + inject_count, + first + initial_count ); + + // Compute keys of imported particles + const auto first_new = particle_iterator_provider( particle_container ); + doComputeParticleClusterKey( first_new, first_new + new_count, cluster_type ); + + // Make a sorting map using the cell keys (like numpy.argsort) + thrust::device_vector particle_index( new_count ); + thrust::counting_iterator iter( 0 ); + thrust::copy(iter, iter + new_count, particle_index.begin()); + thrust::sort_by_key( thrust::device, + particle_container.getPtrCellKeys(), + particle_container.getPtrCellKeys() + new_count, + particle_index.begin() ); + + // Make a buffer + thrust::device_vector buffer( new_count ); + + // Sort particles using thrust::gather, according to the sorting map + for( int idim = 0; idim < particle_container.dimension(); idim++ ) { + thrust::gather( thrust::device, + particle_index.begin(), particle_index.end(), + particle_container.getPtrPosition( idim ), + buffer.begin() ); + particle_container.swapPosition( idim, buffer ); + } + for( int idim = 0; idim < 3; idim++ ) { + thrust::gather( thrust::device, + particle_index.begin(), particle_index.end(), + particle_container.getPtrMomentum( idim ), + buffer.begin() ); + particle_container.swapMomentum( idim, buffer ); + } + thrust::gather( thrust::device, + particle_index.begin(), particle_index.end(), + particle_container.getPtrWeight(), + buffer.begin() ); + particle_container.swapWeight( buffer ); + buffer.resize( 0 ); + + thrust::device_vector buffer_short( new_count ); + thrust::gather( thrust::device, + particle_index.begin(), particle_index.end(), + particle_container.getPtrCharge(), + buffer_short.begin() ); + particle_container.swapCharge( buffer_short ); + buffer_short.resize( 0 ); + + if( particle_container.tracked ) { + 
thrust::device_vector<uint64_t> buffer_uint64( new_count ); + thrust::gather( thrust::device, + particle_index.begin(), particle_index.end(), + particle_container.getPtrId(), + buffer_uint64.begin() ); + particle_container.swapId( buffer_uint64 ); + buffer_uint64.resize( 0 ); + } + // Recompute bins computeBinIndex( particle_container ); - + // This free generates a lot of memory fragmentation. If we enable it we // reduce significantly the memory usage over time but a memory spike // will still be present. Unfortunately, this free generates soo much diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index 5fa0a933b..f1ec4ad8b 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -112,6 +112,32 @@ class nvidiaParticles : public Particles return thrust::raw_pointer_cast( nvidia_id_.data() ); }; + void swapPosition( int idim, thrust::device_vector<double> &new_vector ) { + nvidia_position_[idim].swap( new_vector ); + }; + void swapMomentum( int idim, thrust::device_vector<double> &new_vector ) { + nvidia_momentum_[idim].swap( new_vector ); + }; + void swapWeight( thrust::device_vector<double> &new_vector ) { + nvidia_weight_.swap( new_vector ); + }; + void swapChi( thrust::device_vector<double> &new_vector ) { + nvidia_chi_.swap( new_vector ); + }; + void swapCharge( thrust::device_vector<short> &new_vector ) { + nvidia_charge_.swap( new_vector ); + }; + void swapTau( thrust::device_vector<double> &new_vector ) { + nvidia_tau_.swap( new_vector ); + }; + void swapCellKeys( thrust::device_vector<int> &new_vector ) { + nvidia_cell_keys_.swap( new_vector ); + }; + void swapId( thrust::device_vector<uint64_t> &new_vector ) { + nvidia_id_.swap( new_vector ); + }; + + // ----------------------------------------------------------------------------- //! Move leaving particles to the buffers // ----------------------------------------------------------------------------- From 8a6b4a82115099c145c39af4d1daeefa812376b9 Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Thu, 9 May 2024 07:42:20 +0200 Subject: [PATCH 20/54] add article --- doc/Sphinx/Overview/material.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 9e6e17daf..d0446f1ce 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of April 2024, 183 papers have been published covering a broad range of topics: +As of May 2024, 184 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,6 +50,13 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. + +.. [Azamoum2024] + + Y. Azamoum, G. A. Becker, S. Keppler, G. Duchateau, S. Skupin, M. Grech, F. Catoire, S. Hell, I. Tamer, M. Hornung, M. Hellwing, A. Kessler, F. Schorcht, and M. C. Kaluza, + `Optical probing of ultrafast laser-induced solid-to-overdense-plasma transitions`, + `Light: Science & Applications volume 13, Article number: 109 (2024) `_ + .. [Pan2024] Z. Pan, J. Liu, P. Wang, Z. Mei, Z. Cao, D. Kong, S. Xu, Z. Liu, Y. Liang, Z. Peng, T. Xu, T.
Song, X. Chen, Q. Wu, Y. Zhang, Q. Han, H. Chen, J. Zhao, Y. Gao, S. Chen, Y. Zhao, X. Yan, Y. Shou, W. Ma, From 8447b754bc834a65cb1ab44c34117c658f324a11 Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Sat, 11 May 2024 12:14:24 +0200 Subject: [PATCH 21/54] add publication --- doc/Sphinx/Overview/material.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index d0446f1ce..61fa240c7 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of May 2024, 184 papers have been published covering a broad range of topics: +As of May 2024, 185 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -55,7 +55,7 @@ Following is the distribution of these topics in the listed publications up to N Y. Azamoum, G. A. Becker, S. Keppler, G. Duchateau, S. Skupin, M. Grech, F. Catoire, S. Hell, I. Tamer, M. Hornung, M. Hellwing, A. Kessler, F. Schorcht, and M. C. Kaluza, `Optical probing of ultrafast laser-induced solid-to-overdense-plasma transitions`, - `Light: Science & Applications volume 13, Article number: 109 (2024) `_ + `Light: Science & Applications 13, 109 (2024) `_ .. [Pan2024] @@ -146,7 +146,13 @@ Following is the distribution of these topics in the listed publications up to N A. Seidel, B. Lei, C. Zepter, M. C. Kaluza, A. Sävert, M. Zepf, and D. Seipt, `Polarization and CEP dependence of the transverse phase space in laser driven accelerators`, `Physical Review Research 6, 013056 (2024) `_ - + +.. [Krishnamurthy2023] + + S. Krishnamurthy, S. Chintalwad, A. P. L. Robinson, R. M. G. M. Trines, and B. Ramakrishna, + `Observation of proton modulations in laser–solid interaction`, + `Plasma Physics and Controlled Fusion 65 085020 (2023) `_ + .. [Gao2023b] X. Gao, From 47e30b4b2ba663aabd546e389f1fb9d985b38a9b Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Mon, 13 May 2024 22:25:14 +0200 Subject: [PATCH 22/54] add publication --- doc/Sphinx/Overview/material.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 61fa240c7..04973edbf 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of May 2024, 185 papers have been published covering a broad range of topics: +As of May 2024, 186 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -51,6 +51,12 @@ Following is the distribution of these topics in the listed publications up to N You can count the number of papers in the list with the vim command :%s/.. \[//gn. +.. [Timmis2024] + + R. J. L. Timmis, R. W. Paddock, I. Ouatu, J. Lee, S. Howard, E. Atonga, R. T. Ruskov, H. Martin, R. H. W. Wang, R. Aboushelbaya, M. W. von der Leyen, E. Gumbrell and P. A. 
Norreys, + `Attosecond and nano‐Coulomb electron bunches via the Zero Vector Potential mechanism`, + `Scientific Reports volume 14, 10805 (2024) `_ + .. [Azamoum2024] Y. Azamoum, G. A. Becker, S. Keppler, G. Duchateau, S. Skupin, M. Grech, F. Catoire, S. Hell, I. Tamer, M. Hornung, M. Hellwing, A. Kessler, F. Schorcht, and M. C. Kaluza, From 18f1e1c120143504865619c06e937435f43a53b4 Mon Sep 17 00:00:00 2001 From: Arnaud Beck Date: Tue, 14 May 2024 17:25:22 +0200 Subject: [PATCH 23/54] Typo in deprecated error message --- src/Python/pyprofiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Python/pyprofiles.py b/src/Python/pyprofiles.py index 0e122a1a9..2fff14c1f 100755 --- a/src/Python/pyprofiles.py +++ b/src/Python/pyprofiles.py @@ -702,7 +702,7 @@ def LaserGaussianAM( box_side="xmin", a0=1., omega=1., focus=None, waist=3., print("ERROR: focus should be a list of length 1") exit(1) elif (len(focus)==2): - print("WARNING: deprecated focus in LaserEnvelopeGaussianAM should be a list of length 1") + print("WARNING: deprecated focus in LaserGaussianAM should be a list of length 1") # Polarization and amplitude [dephasing, amplitudeY, amplitudeZ] = transformPolarization(polarization_phi, ellipticity) amplitudeY *= a0 * omega From af0070a2315a8952ee8ab4ae8c5b2d67916bdb3c Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Wed, 15 May 2024 11:53:43 +0200 Subject: [PATCH 24/54] try sorting with zip_iterator --- .../compile_tools/machine/jean_zay_gpu_V100 | 17 ++++++- src/Particles/nvidiaParticles.cu | 46 ++++--------------- src/Particles/nvidiaParticles.h | 13 ++++++ 3 files changed, 36 insertions(+), 40 deletions(-) diff --git a/scripts/compile_tools/machine/jean_zay_gpu_V100 b/scripts/compile_tools/machine/jean_zay_gpu_V100 index 7fa7ce513..cc9d15c8b 100644 --- a/scripts/compile_tools/machine/jean_zay_gpu_V100 +++ b/scripts/compile_tools/machine/jean_zay_gpu_V100 @@ -5,12 +5,25 @@ # Documentation: # http://www.idris.fr/jean-zay # +# Use the following commented commands to have the proper environment for compilation and running +# +# module purge +# module load anaconda-py3/2020.11 +# module load nvidia-compilers/23.11 +# module load cuda/12.2.0 +# module load openmpi/4.1.5-cuda +# module load hdf5/1.12.0-mpi-cuda +# export HDF5_ROOT_DIR=/gpfslocalsup/spack_soft/hdf5/1.12.0/nvhpc-23.11-i5lyakq3iu254ru3eqe2yukvg7airopl +# export I_MPI_CXX=pgc++ +# export SMILEICXX=mpic++ +# export CICCFLAG="--c++14" + SMILEICXX_DEPS = g++ #GPU_COMPILER = nvcc CXXFLAGS += -w -CXXFLAGS += -ta=tesla:cc70 -std=c++14 -lcurand -Minfo=accel # what is offloaded/copied +CXXFLAGS += -acc=gpu -gpu=cc70 -std=c++14 -lcurand -Minfo=accel # what is offloaded/copied # CXXFLAGS += -Minfo=all # very verbose output CXXFLAGS += -D__GCC_ATOMIC_TEST_AND_SET_TRUEVAL=1 @@ -18,4 +31,4 @@ CXXFLAGS += -D__GCC_ATOMIC_TEST_AND_SET_TRUEVAL=1 GPU_COMPILER_FLAGS += -O3 --std c++14 -arch=sm_70 GPU_COMPILER_FLAGS += --expt-relaxed-constexpr -LDFLAGS += -ta=tesla:cc70 -std=c++14 -Mcudalib=curand -lcudart -lcurand -lacccuda +LDFLAGS += -acc=gpu -gpu=cc70 -std=c++14 -cudalib=curand -lcudart -lcurand -lacccuda diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index 5a6524a88..85e5a5bf7 100644 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -512,48 +512,18 @@ namespace detail { particle_container.getPtrCellKeys() + new_count, particle_index.begin() ); - // Make a buffer - thrust::device_vector buffer( new_count ); + + particle_to_inject.softReserve( new_count ); + 
particle_to_inject.resize( new_count ); - // Sort particles using thrust::gather, according to the sorting map - for( int idim = 0; idim < particle_container.dimension(); idim++ ) { - thrust::gather( thrust::device, - particle_index.begin(), particle_index.end(), - particle_container.getPtrPosition( idim ), - buffer.begin() ); - particle_container.swapPosition( idim, buffer ); - } - for( int idim = 0; idim < 3; idim++ ) { - thrust::gather( thrust::device, - particle_index.begin(), particle_index.end(), - particle_container.getPtrMomentum( idim ), - buffer.begin() ); - particle_container.swapMomentum( idim, buffer ); - } - thrust::gather( thrust::device, - particle_index.begin(), particle_index.end(), - particle_container.getPtrWeight(), - buffer.begin() ); - particle_container.swapWeight( buffer ); - buffer.resize( 0 ); - - thrust::device_vector buffer_short( new_count ); + const auto first_unsorted = particle_no_key_iterator_provider( particle_container ); + const auto first_buffer = particle_no_key_iterator_provider( particle_to_inject ); thrust::gather( thrust::device, particle_index.begin(), particle_index.end(), - particle_container.getPtrCharge(), - buffer_short.begin() ); - particle_container.swapCharge( buffer_short ); - buffer_short.resize( 0 ); + first_unsorted, + first_buffer ); - if( particle_container.tracked ) { - thrust::device_vector buffer_uint64( new_count ); - thrust::gather( thrust::device, - particle_index.begin(), particle_index.end(), - particle_container.getPtrId(), - buffer_uint64.begin() ); - particle_container.swapId( buffer_uint64 ); - buffer_uint64.resize( 0 ); - } + particle_container.swap( particle_to_inject ); // Recompute bins computeBinIndex( particle_container ); diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index f1ec4ad8b..0bb254cef 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -137,6 +137,19 @@ class nvidiaParticles : public Particles nvidia_id_.swap( new_vector ); }; + void swap( nvidiaParticles &p ) { + for( int idim = 0; idim < dimension(); idim++ ) { + swapPosition( idim, p.nvidia_position_[idim] ); + } + for( int idim = 0; idim < 3; idim++ ) { + swapMomentum( idim, p.nvidia_momentum_[idim] ); + } + swapWeight( p.nvidia_weight_ ); + swapCharge( p.nvidia_charge_ ); + if( tracked ) { + swapId( p.nvidia_id_ ); + } + }; // ----------------------------------------------------------------------------- //! Move leaving particles to the buffers From 227811ca0f496f7d5856cf70ed4791f4e0e066b4 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Thu, 16 May 2024 22:17:22 +0200 Subject: [PATCH 25/54] huge simplification of nvidiaParticles using thrust asynchronism --- src/Particles/Particles.cpp | 3 +- src/Particles/Particles.h | 8 +- src/Particles/nvidiaParticles.cu | 872 ++++++------------------------- src/Particles/nvidiaParticles.h | 64 +-- src/Species/Species.cpp | 4 +- 5 files changed, 205 insertions(+), 746 deletions(-) diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp index 34eaeb161..d4eea30e9 100755 --- a/src/Particles/Particles.cpp +++ b/src/Particles/Particles.cpp @@ -1398,10 +1398,9 @@ int Particles::eraseLeavingParticles() return 0; } -int Particles::injectParticles( Particles *particles_to_inject ) +void Particles::copyParticles( Particles* particles_to_inject ) { ERROR( "Device only feature, should not have come here! On CPU it's done in sortParticles." 
); - return 0; } void Particles::importAndSortParticles( Particles *particles_to_inject ) diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h index c0e5958e3..91689ef3f 100755 --- a/src/Particles/Particles.h +++ b/src/Particles/Particles.h @@ -484,11 +484,9 @@ class Particles virtual int eraseLeavingParticles(); // ----------------------------------------------------------------------------- - //! Inject particles from particles_to_inject object and put - //! them in the Particles object - //! \param[in,out] particles_to_inject Particles object containing particles to inject - virtual int injectParticles( Particles *particles_to_inject ); - + //! Resize & Copy particles from particles_to_inject to the end of the vectors + virtual void copyParticles( Particles* particles_to_inject ); + //! Implementation of a somewhat efficient particle injection, sorting //! (including removing leaving particles) and binning for GPU if //! available for the configuration of offloading technology diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index 85e5a5bf7..617cb0851 100644 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -46,21 +46,13 @@ struct cellKeyEquals } }; -struct cellKeyNegative -{ - constexpr __host__ __device__ bool - operator()( const int& x ) const - { - return x < 0; - } -}; - -struct cellKeyBelowMinus1 -{ - constexpr __host__ __device__ bool - operator()( const int& x ) const - { - return x < -1; - } -}; +template <int key> +struct cellKeyBelow +{ + constexpr __host__ __device__ bool + operator()( const int& x ) const + { + return x < key; + } +}; @@ -91,12 +83,6 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ); - //! Sort the particle on GPU by their cluster/cell key. - //! - static inline void - sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ); - //! precondition: //! - nvidia_cell_keys_ shall be sorted in non decreasing order //! - last_index.data() is a pointer mapped to GPU via @@ -127,22 +113,6 @@ namespace detail { InputIterator last, ClusterType cluster_type ); - template <typename RandomAccessIterator0, typename RandomAccessIterator1> - static void - doSortParticleByKey( RandomAccessIterator0 key_first, - RandomAccessIterator0 key_last, - RandomAccessIterator1 value_first ); - - template <typename ClusterType, typename ParticleIteratorProvider, typename ParticleNoKeyIteratorProvider> - static void - doImportAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - ClusterType cluster_type, - ParticleIteratorProvider particle_iterator_provider, - ParticleNoKeyIteratorProvider particle_no_key_iterator_provider ); }; @@ -155,8 +125,8 @@ namespace detail { double inverse_y_cell_dimension, SizeType local_x_dimension_in_cell, SizeType local_y_dimension_in_cell, - int CellStartingGlobalIndex_for_x, - int CellStartingGlobalIndex_for_y); + int CellStartingGlobalIndex_for_x, + int CellStartingGlobalIndex_for_y); //! Compute the cell key of a_particle. a_particle shall be a tuple (from a //! zipiterator).
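The gather-based sort that patches 19 and 25 converge on is easier to read in isolation. Below is a self-contained sketch of the argsort-and-gather pattern for a single particle property, assuming one int key array per particle; the function and variable names are illustrative, not the Smilei API.

    // Illustrative sketch (assumption: simplified from the patches above).
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/gather.h>
    #include <thrust/sequence.h>
    #include <thrust/sort.h>

    void sortOnePropertyByKey( thrust::device_vector<int>    &keys,  // cell keys, sorted in place
                               thrust::device_vector<double> &prop ) // one particle property
    {
        const size_t n = keys.size();
        // Build the permutation that sorts the keys (like numpy.argsort).
        thrust::device_vector<int> perm( n );
        thrust::sequence( perm.begin(), perm.end() );
        thrust::sort_by_key( thrust::device, keys.begin(), keys.end(), perm.begin() );
        // Apply the permutation to the property via gather, then swap buffers.
        thrust::device_vector<double> sorted( n );
        thrust::gather( thrust::device, perm.begin(), perm.end(), prop.begin(), sorted.begin() );
        prop.swap( sorted );
    }

Sorting the index permutation once and gathering every property through it avoids re-sorting the keys for each property; the final swap recycles the buffer in the same way as the swapPosition()/swapMomentum() helpers introduced above.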
@@ -174,21 +144,11 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ); - static void - sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ); - - static void - importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ); - public: double inverse_of_x_cell_dimension_; double inverse_of_y_cell_dimension_; SizeType local_y_dimension_in_cluster_; - int CellStartingGlobalIndex_for_x_; + int CellStartingGlobalIndex_for_x_; int CellStartingGlobalIndex_for_y_; }; @@ -203,7 +163,7 @@ namespace detail { SizeType local_x_dimension_in_cell, SizeType local_y_dimension_in_cell, SizeType local_z_dimension_in_cell, - int CellStartingGlobalIndex_for_x, + int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y, int CellStartingGlobalIndex_for_z); @@ -223,16 +183,6 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ); - static void - sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ); - - static void - importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ); - public: double inverse_of_x_cell_dimension_; double inverse_of_y_cell_dimension_; @@ -240,7 +190,7 @@ namespace detail { SizeType local_y_dimension_in_cluster_; SizeType local_z_dimension_in_cluster_; int CellStartingGlobalIndex_for_x_; - int CellStartingGlobalIndex_for_y_; + int CellStartingGlobalIndex_for_y_; int CellStartingGlobalIndex_for_z_; }; @@ -270,47 +220,6 @@ namespace detail { }; - //! This functor checks the cluster key of a_particle. - //! - template - struct OutOfClusterPredicate - { - public: - public: - OutOfClusterPredicate( ClusterType cluster_type ) - : cluster_type_{ cluster_type } - { - // EMPTY - } - - template - __host__ __device__ bool - operator()( const Tuple& a_particle ) const - { - // NOTE: its ub to set the cluster key to wrongly keyed particles - // now.. - return thrust::get<0>( a_particle ) /* cluster key */ != cluster_type_.Index( a_particle ); - } - - protected: - ClusterType cluster_type_; - }; - - - //! If the particle's cell/cluster key is -1 it means that it needs to be - //! evicted. - //! - struct OutOfBoundaryPredicate - { - template - __host__ __device__ bool - operator()( const Tuple& a_particle ) const - { - return thrust::get<0>( a_particle ) /* cluster key */ < 0; - } - }; - - //////////////////////////////////////////////////////////////////////////////// // Cluster manipulation functor method definitions //////////////////////////////////////////////////////////////////////////////// @@ -343,31 +252,6 @@ namespace detail { } } - inline void - Cluster::sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ) - { - // This is where we do a runtime dispatch depending on the simulation's - // dimensions. 
- - switch( particle_container.dimension() ) { - case 2: { - Cluster2D::sortParticleByKey( particle_container, - parameters ); - break; - } - case 3: { - Cluster3D::sortParticleByKey( particle_container, - parameters ); - break; - } - default: - // Not implemented, only Cartesian 2D or 3D for the moment - SMILEI_ASSERT( false ); - break; - } - } - inline void Cluster::computeBinIndex( nvidiaParticles& particle_container ) { @@ -408,78 +292,10 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ) { - // This is where we do a runtime dispatch depending on the simulation's - // dimensions. - - switch( particle_container.dimension() ) { - case 2: { - Cluster2D::importAndSortParticles( particle_container, - particle_to_inject, - parameters, - a_parent_patch ); - break; - } - case 3: { - Cluster3D::importAndSortParticles( particle_container, - particle_to_inject, - parameters, - a_parent_patch ); - break; - } - - default: - // Not implemented, only 2D for the moment - SMILEI_ASSERT( false ); - break; - } - } - - template - void - Cluster::doComputeParticleClusterKey( InputIterator first, - InputIterator last, - ClusterType cluster_type ) - { - thrust::for_each( thrust::device, - first, last, - AssignClusterIndex{ cluster_type } ); - } - - template - void - Cluster::doSortParticleByKey( RandomAccessIterator0 key_first, - RandomAccessIterator0 key_last, - RandomAccessIterator1 value_first ) - { - thrust::sort_by_key( thrust::device, - key_first, key_last, - value_first ); - } - - template - void - Cluster::doImportAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - ClusterType cluster_type, - ParticleIteratorProvider particle_iterator_provider, - ParticleNoKeyIteratorProvider particle_no_key_iterator_provider ) - { - const auto first_particle = particle_iterator_provider( particle_container ); - auto last_particle = first_particle + particle_container.deviceSize(); - // Remove out of bound particles - // Using more memory, we could use the faster remove_copy_if - // NOTE: remove_if is stable. 
-        last_particle = thrust::remove_if( thrust::device,
-                                           first_particle,
-                                           last_particle,
-                                           OutOfBoundaryPredicate{} );
+        const auto erased_count = particle_container.eraseParticlesByPredicate( cellKeyBelow<0>() );

-        const auto initial_count = std::distance( first_particle, last_particle );
+        const auto initial_count = particle_container.deviceSize() - erased_count;
         const auto inject_count  = particle_to_inject.deviceSize();
         const auto new_count     = initial_count + inject_count;
@@ -492,38 +308,17 @@
         particle_container.resize( new_count );

         // Combine imported particles to main particles
-        const auto first           = particle_no_key_iterator_provider( particle_container );
-        const auto first_to_inject = particle_no_key_iterator_provider( particle_to_inject );
-        thrust::copy( thrust::device,
-                      first_to_inject,
-                      first_to_inject + inject_count,
-                      first + initial_count );
+        particle_container.copyParticles( &particle_to_inject, initial_count );

-        // Compute keys of imported particles
-        const auto first_new = particle_iterator_provider( particle_container );
-        doComputeParticleClusterKey( first_new, first_new + new_count, cluster_type );
+        // Compute keys of particles
+        computeParticleClusterKey( particle_container, parameters, a_parent_patch );

-        // Make a sorting map using the cell keys (like numpy.argsort)
-        thrust::device_vector<int> particle_index( new_count );
-        thrust::counting_iterator<int> iter( 0 );
-        thrust::copy(iter, iter + new_count, particle_index.begin());
-        thrust::sort_by_key( thrust::device,
-                             particle_container.getPtrCellKeys(),
-                             particle_container.getPtrCellKeys() + new_count,
-                             particle_index.begin() );
-
-
+        // Use particle_to_inject as a buffer
         particle_to_inject.softReserve( new_count );
         particle_to_inject.resize( new_count );
-        const auto first_unsorted = particle_no_key_iterator_provider( particle_container );
-        const auto first_buffer   = particle_no_key_iterator_provider( particle_to_inject );
-        thrust::gather( thrust::device,
-                        particle_index.begin(), particle_index.end(),
-                        first_unsorted,
-                        first_buffer );
-
-        particle_container.swap( particle_to_inject );
+        // Sort particles using thrust::gather, according to the sorting map
+        particle_container.sortParticleByKey( particle_to_inject );

         // Recompute bins
         computeBinIndex( particle_container );
@@ -536,6 +331,17 @@ namespace detail {
         // particle_to_inject.free();
     }

+    template <typename InputIterator, typename ClusterType>
+    void
+    Cluster::doComputeParticleClusterKey( InputIterator first,
+                                          InputIterator last,
+                                          ClusterType   cluster_type )
+    {
+        thrust::for_each( thrust::device,
+                          first, last,
+                          AssignClusterIndex<ClusterType>{ cluster_type } );
+    }

    ////////////////////////////////////////////////////////////////////////////////
    // Cluster2D method definitions
    ////////////////////////////////////////////////////////////////////////////////

@@ -546,12 +352,12 @@ namespace detail {
                double   inverse_y_cell_dimension,
                SizeType local_x_dimension_in_cell,
                SizeType local_y_dimension_in_cell,
-               int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y )
+               int      CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y )
         : inverse_of_x_cell_dimension_{ inverse_x_cell_dimension }
         , inverse_of_y_cell_dimension_{ inverse_y_cell_dimension }
         , local_y_dimension_in_cluster_{ local_y_dimension_in_cell / kClusterWidth }
         , CellStartingGlobalIndex_for_x_{CellStartingGlobalIndex_for_x}
-        , CellStartingGlobalIndex_for_y_{CellStartingGlobalIndex_for_y}
+        , CellStartingGlobalIndex_for_y_{CellStartingGlobalIndex_for_y}
     {
         // EMPTY
     }

@@ -563,7 +369,7 @@ namespace detail {
                SizeType local_x_dimension_in_cell,
                SizeType local_y_dimension_in_cell,
                SizeType
local_z_dimension_in_cell, - int CellStartingGlobalIndex_for_x, + int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y, int CellStartingGlobalIndex_for_z ) : inverse_of_x_cell_dimension_{ inverse_x_cell_dimension } , inverse_of_y_cell_dimension_{ inverse_y_cell_dimension } @@ -662,7 +468,7 @@ namespace detail { const auto last = first + particle_container.deviceSize(); int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - doComputeParticleClusterKey( first, last, + doComputeParticleClusterKey( first, last, Cluster2D{ parameters.res_space[0], parameters.res_space[1], parameters.patch_size_[0], @@ -685,7 +491,7 @@ namespace detail { int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); int CellStartingGlobalIndex_for_z = a_parent_patch.getCellStartingGlobalIndex_noGC(2); - doComputeParticleClusterKey( first, last, + doComputeParticleClusterKey( first, last, Cluster3D{ parameters.res_space[0], parameters.res_space[1], parameters.res_space[2], @@ -697,277 +503,6 @@ namespace detail { CellStartingGlobalIndex_for_z } ); } - template - void - Cluster2D::sortParticleByKey( nvidiaParticles& particle_container, - const Params& ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. - - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // The appropriate thrust::zip_iterator for the current - // simulation's parameters - - const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - - doSortParticleByKey( particle_container.getPtrCellKeys(), - particle_container.getPtrCellKeys() + particle_container.deviceSize(), - value_first ); - } - } - } - - template - void - Cluster3D::sortParticleByKey( nvidiaParticles& particle_container, - const Params& ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. 
- - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // The appropriate thrust::zip_iterator for the current - // simulation's parameters - - if (particle_container.tracked) { - const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge(), - particle_container.getPtrId() ) ); - doSortParticleByKey( particle_container.getPtrCellKeys(), - particle_container.getPtrCellKeys() + particle_container.deviceSize(), - value_first ); - - } - else { - const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - doSortParticleByKey( particle_container.getPtrCellKeys(), - particle_container.getPtrCellKeys() + particle_container.deviceSize(), - value_first ); - } - } - } - } - - template - void - Cluster2D::importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. 
- int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); - int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - - const Cluster2D cluster_manipulator{ parameters.res_space[0], - parameters.res_space[1], - parameters.patch_size_[0], - parameters.patch_size_[1], - CellStartingGlobalIndex_for_x, CellStartingGlobalIndex_for_y}; - - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // Returns the appropriate thrust::zip_iterator for the - // current simulation's parameters - const auto particle_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), - particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - const auto particle_no_key_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - doImportAndSortParticles( particle_container, - particle_to_inject, - cluster_manipulator, - particle_iterator_provider, - particle_no_key_iterator_provider ); - } - } - } - - template - void - Cluster3D::importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. 
- int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); - int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - int CellStartingGlobalIndex_for_z = a_parent_patch.getCellStartingGlobalIndex_noGC(2); - - const Cluster3D cluster_manipulator{ parameters.res_space[0], - parameters.res_space[1], - parameters.res_space[2], - parameters.patch_size_[0], - parameters.patch_size_[1], - parameters.patch_size_[2], - CellStartingGlobalIndex_for_x, - CellStartingGlobalIndex_for_y, CellStartingGlobalIndex_for_z}; - - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // Returns the appropriate thrust::zip_iterator for the - // current simulation's parameters - if (particle_container.tracked) { - const auto particle_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), - particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge(), - particle_container.getPtrId() ) ); - }; - const auto particle_no_key_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge(), - particle_container.getPtrId() ) ); - }; - doImportAndSortParticles( particle_container, - particle_to_inject, - cluster_manipulator, - particle_iterator_provider, - particle_no_key_iterator_provider ); - } - else { - const auto particle_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), - particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - const auto particle_no_key_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - doImportAndSortParticles( particle_container, - particle_to_inject, - cluster_manipulator, - particle_iterator_provider, - particle_no_key_iterator_provider ); - } - } - } - } - } // namespace detail @@ -1270,7 +805,7 @@ void nvidiaParticles::initializeDataOnDevice() detail::Cluster::computeParticleClusterKey( *this, *parameters_, 
 *parent_patch_ );

     // The particles are not correctly sorted when created.
-    detail::Cluster::sortParticleByKey( *this, *parameters_ );
+    sortParticleByKey();

     detail::Cluster::computeBinIndex( *this );
     setHostBinIndex();
@@ -1365,7 +900,7 @@ unsigned int nvidiaParticles::deviceCapacity() const
 // -----------------------------------------------------------------------------
 void nvidiaParticles::copyLeavingParticlesToBuffer( Particles* buffer )
 {
-    copyParticlesByPredicate( buffer, cellKeyBelowMinus1() );
+    copyParticlesByPredicate( buffer, cellKeyBelow<-1>() );
     buffer->copyFromDeviceToHost( true );
 }

@@ -1379,86 +914,64 @@ void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pre
     // and keep the good ones. This would help us avoid the std::remove_if in
     // the particle injection and sorting algorithm.

-    const int nparts = gpu_nparts_;
-    // Iterator of the main data structure
-    // NOTE: https://nvidia.github.io/thrust/api/classes/classthrust_1_1zip__iterator.html#class-thrustzip_iterator
-    const auto source_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(),
-                                                                                      nvidia_momentum_[0].begin(),
-                                                                                      nvidia_momentum_[1].begin(),
-                                                                                      nvidia_momentum_[2].begin(),
-                                                                                      nvidia_weight_.begin(),
-                                                                                      nvidia_charge_.begin(),
-                                                                                      nvidia_cell_keys_.begin() ) );
-    const auto source_iterator_last = source_iterator_first + nparts; // std::advance
+    // Count particles satisfying the predicate
+    const auto keys = getPtrCellKeys();
+    const int nparts_to_copy = thrust::count_if( thrust::device, keys, keys + gpu_nparts_, pred );

-    nvidiaParticles* const cp_parts = static_cast<nvidiaParticles*>( buffer );
+    // Resize destination buffer (copy_if does not resize)
+    nvidiaParticles* const dest = static_cast<nvidiaParticles*>( buffer );
+    dest->resize( nparts_to_copy );

-    const int nparts_to_copy = thrust::count_if( thrust::device,
-                                                 nvidia_cell_keys_.cbegin(),
-                                                 nvidia_cell_keys_.cbegin() + nparts,
-                                                 pred );
-
-    // Resize it, if too small (copy_if do not resize)
-    cp_parts->resize( nparts_to_copy );
-
-    const auto destination_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( cp_parts->nvidia_position_[0].begin(),
-                                                                                           cp_parts->nvidia_momentum_[0].begin(),
-                                                                                           cp_parts->nvidia_momentum_[1].begin(),
-                                                                                           cp_parts->nvidia_momentum_[2].begin(),
-                                                                                           cp_parts->nvidia_weight_.begin(),
-                                                                                           cp_parts->nvidia_charge_.begin(),
-                                                                                           cp_parts->nvidia_cell_keys_.begin() ) );
-
-    // Copy send particles in dedicated data structure
-    thrust::copy_if( thrust::device,
-                     source_iterator_first,
-                     source_iterator_last,
-                     nvidia_cell_keys_.cbegin(),
-                     destination_iterator_first,
-                     pred );
-
-    // Copy the other position values depending on the simulation's grid dimensions
-    const int ndim_particles = nvidia_position_.size();
-    for( int i = 1; i < ndim_particles; ++i ) {
-        thrust::copy_if( thrust::device,
-                         nvidia_position_[i].cbegin(),
-                         nvidia_position_[i].cbegin() + nparts,
-                         nvidia_cell_keys_.cbegin(),
-                         cp_parts->nvidia_position_[i].begin(),
-                         pred );
-    }
-
-    // Special treatment for chi if radiation emission
-    if( has_quantum_parameter ) {
-        thrust::copy_if( thrust::device,
-                         nvidia_chi_.cbegin(),
-                         nvidia_chi_.cbegin() + nparts,
-                         nvidia_cell_keys_.cbegin(),
-                         cp_parts->nvidia_chi_.begin(),
-                         pred );
+    if( nparts_to_copy ) {
+        // Copy the particles to the destination
+        for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
+            const auto in = getPtrDoubleProp( ip );
+            const auto out = dest->getPtrDoubleProp( ip );
+            thrust::copy_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, out, pred );
+        }
+        for( int ip = 0; ip <
 getNShortProp(); ip++ ) {
+            const auto in = getPtrShortProp( ip );
+            const auto out = dest->getPtrShortProp( ip );
+            thrust::copy_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, out, pred );
+        }
+        if( tracked ) {
+            const auto in = getPtrId();
+            const auto out = dest->getPtrId();
+            thrust::copy_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, out, pred );
+        }
+        cudaDeviceSynchronize();
     }
+}

-    if( has_Monte_Carlo_process ) {
-        thrust::copy_if( thrust::device,
-                         nvidia_tau_.cbegin(),
-                         nvidia_tau_.cbegin() + nparts,
-                         nvidia_cell_keys_.cbegin(),
-                         cp_parts->nvidia_tau_.begin(),
-                         pred );
-    }
+void nvidiaParticles::copyParticles( Particles* particles_to_inject )
+{
+    const auto nparts = gpu_nparts_;
+    nvidiaParticles* to_inject = static_cast<nvidiaParticles*>( particles_to_inject );
+    resize( nparts + to_inject->gpu_nparts_ );
+    copyParticles( to_inject, nparts );
+}

+void nvidiaParticles::copyParticles( nvidiaParticles* particles_to_inject, size_t offset )
+{
+    // Copy the particles to the destination
+    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
+        const auto in = particles_to_inject->getPtrDoubleProp( ip );
+        const auto out = getPtrDoubleProp( ip );
+        thrust::copy_n( thrust::cuda::par_nosync, in, particles_to_inject->gpu_nparts_, out + offset );
+    }
+    for( int ip = 0; ip < getNShortProp(); ip++ ) {
+        const auto in = particles_to_inject->getPtrShortProp( ip );
+        const auto out = getPtrShortProp( ip );
+        thrust::copy_n( thrust::cuda::par_nosync, in, particles_to_inject->gpu_nparts_, out + offset );
+    }
     if( tracked ) {
-        thrust::copy_if( thrust::device,
-                         nvidia_id_.cbegin(),
-                         nvidia_id_.cbegin() + nparts,
-                         nvidia_cell_keys_.cbegin(),
-                         cp_parts->nvidia_id_.begin(),
-                         pred );
+        const auto in = particles_to_inject->getPtrId();
+        const auto out = getPtrId();
+        thrust::copy_n( thrust::cuda::par_nosync, in, particles_to_inject->gpu_nparts_, out + offset );
     }
-
+    cudaDeviceSynchronize();
 }

-
 // -----------------------------------------------------------------------------
 //! Erase `npart` particles from `ipart`
 // -----------------------------------------------------------------------------
@@ -1484,157 +997,43 @@ void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pre
 //}

 // -----------------------------------------------------------------------------
-//! Erase particles leaving the patch object on device
+//! Erase particles leaving the patch on device
 // -----------------------------------------------------------------------------
 int nvidiaParticles::eraseLeavingParticles()
 {
-    return eraseParticlesByPredicate( cellKeyNegative() );
+    const auto nremoved = eraseParticlesByPredicate( cellKeyBelow<0>() );
+    resize( gpu_nparts_ - nremoved );
+    return nremoved;
 }

+//! "Erase" particles but does not resize the arrays!
 template <typename Predicate>
 int nvidiaParticles::eraseParticlesByPredicate( Predicate pred )
 {
-    const int position_dimension_count = nvidia_position_.size();
-
-    const int nparts = gpu_nparts_;
-    const int nparts_to_remove = thrust::count_if( thrust::device,
-                                                   nvidia_cell_keys_.begin(),
-                                                   nvidia_cell_keys_.begin() + nparts,
-                                                   pred );
-
-    if( nparts_to_remove > 0 ) {
-        const auto first_particle = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(),
-                                                                                   nvidia_momentum_[0].begin(),
-                                                                                   nvidia_momentum_[1].begin(),
-                                                                                   nvidia_momentum_[2].begin(),
-                                                                                   nvidia_weight_.begin(),
-                                                                                   nvidia_charge_.begin() ) );
-
-        const auto last_particle = first_particle + nparts;
-
-        // Remove particles which leaves current patch
-        thrust::remove_if( thrust::device,
-                           first_particle,
-                           last_particle,
-                           nvidia_cell_keys_.cbegin(),
-                           pred );
-
-        // Remove the other position values depending on the simulation's grid
-        // dimensions
-        for( int i = 1; i < position_dimension_count; ++i ) {
-            thrust::remove_if( thrust::device,
-                               nvidia_position_[i].begin(),
-                               nvidia_position_[i].begin() + nparts,
-                               nvidia_cell_keys_.cbegin(),
-                               pred );
-        }
-
-        if( has_quantum_parameter ) {
-            thrust::remove_if( thrust::device,
-                               nvidia_chi_.begin(),
-                               nvidia_chi_.begin() + nparts,
-                               nvidia_cell_keys_.cbegin(),
-                               pred );
-        }
-
-        if( has_Monte_Carlo_process ) {
-            thrust::remove_if( thrust::device,
-                               nvidia_tau_.begin(),
-                               nvidia_tau_.begin() + nparts,
-                               nvidia_cell_keys_.cbegin(),
-                               pred );
-        }
-
-        if( tracked ) {
-            thrust::remove_if( thrust::device,
-                               nvidia_id_.begin(),
-                               nvidia_id_.begin() + nparts,
-                               nvidia_cell_keys_.cbegin(),
-                               pred );
-        }
-
-        // Update current number of particles
-        gpu_nparts_ -= nparts_to_remove;
-
-        // Resize data structures (remove_if does not resize)
-        resize( gpu_nparts_ );
-    }
-
-    return nparts_to_remove;
-}
-
-int nvidiaParticles::injectParticles( Particles* particles_to_inject )
-{
-    const int nparts = gpu_nparts_;
-
-    // Manage the recv data structure
-    nvidiaParticles* const cp_parts = static_cast<nvidiaParticles*>( particles_to_inject );
-
-    const int nparts_add = cp_parts->gpu_nparts_;
-    const int tot_parts  = nparts + nparts_add;
-
-    const int position_dimension_count = nvidia_position_.size();
-
-    // Resize main data structure, if too small (copy_n do not resize)
-    resize( tot_parts );
-
-    const auto source_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( cp_parts->nvidia_position_[0].cbegin(),
-                                                                                      cp_parts->nvidia_momentum_[0].cbegin(),
-                                                                                      cp_parts->nvidia_momentum_[1].cbegin(),
-                                                                                      cp_parts->nvidia_momentum_[2].cbegin(),
-                                                                                      cp_parts->nvidia_weight_.cbegin(),
-                                                                                      cp_parts->nvidia_charge_.cbegin() ) );
-
-    // Iterator of the main data structure (once it has been resized)
-    const auto destination_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(),
-                                                                                           nvidia_momentum_[0].begin(),
-                                                                                           nvidia_momentum_[1].begin(),
-                                                                                           nvidia_momentum_[2].begin(),
-                                                                                           nvidia_weight_.begin(),
-                                                                                           nvidia_charge_.begin() ) ) +
-                                            nparts;
-
-    // Copy recv particles in main data structure
-    thrust::copy_n( thrust::device,
-                    source_iterator_first,
-                    nparts_add,
-                    destination_iterator_first );
-
-    // Remove the other position values depending on the simulation's grid
-    // dimensions
-    for( int i = 1; i < position_dimension_count; ++i ) {
-        thrust::copy_n( thrust::device,
-                        cp_parts->nvidia_position_[i].cbegin(),
-                        nparts_add,
-                        nvidia_position_[i].begin() + nparts );
-    }
-
-    if( has_quantum_parameter ) {
-        thrust::copy_n( thrust::device,
-                        cp_parts->nvidia_chi_.cbegin(),
-                        nparts_add,
-                        nvidia_chi_.begin() + nparts
 );
+    const auto keys = getPtrCellKeys();
+    const int nparts_to_remove = thrust::count_if( thrust::device, keys, keys + gpu_nparts_, pred );
+
+    // Copy the particles to the destination
+    // Using more memory, we could use the faster remove_copy_if
+    // NOTE: remove_if is stable.
+    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
+        const auto in = getPtrDoubleProp( ip );
+        thrust::remove_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, pred );
     }
-
-    if( has_Monte_Carlo_process ) {
-        thrust::copy_n( thrust::device,
-                        cp_parts->nvidia_tau_.cbegin(),
-                        nparts_add,
-                        nvidia_tau_.begin() + nparts );
+    for( int ip = 0; ip < getNShortProp(); ip++ ) {
+        const auto in = getPtrShortProp( ip );
+        thrust::remove_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, pred );
     }
-    if( tracked ) {
-        thrust::copy_n( thrust::device,
-                        cp_parts->nvidia_id_.cbegin(),
-                        nparts_add,
-                        nvidia_id_.begin() + nparts );
+    if( tracked ) {
+        const auto in = getPtrId();
+        thrust::remove_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, pred );
     }
-
-    // No more particles to move
-    cp_parts->resize( 0 );
-
-    return nparts_add;
+    cudaDeviceSynchronize();
+
+    return nparts_to_remove;
 }

+
 // ---------------------------------------------------------------------------------------------------------------------
 //! Create n_additional_particles new particles at the end of vectors
 //! Fill the new elements with 0
@@ -1684,6 +1083,60 @@ void nvidiaParticles::importAndSortParticles( Particles* particles_to_inject )
     setHostBinIndex();
 }

+//! Sort by cell_keys_
+//! This version synchronizes for every vector, but uses less buffers
+void nvidiaParticles::sortParticleByKey()
+{
+    // Make a sorting map using the cell keys (like numpy.argsort)
+    thrust::device_vector<int> index( gpu_nparts_ );
+    thrust::sequence( thrust::device, index.begin(), index.end() );
+    thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() );
+
+    // Sort particles using thrust::gather, according to the sorting map
+    thrust::device_vector<double> buffer( gpu_nparts_ );
+    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
+        thrust::gather( thrust::device, index.begin(), index.end(), getPtrDoubleProp( ip ), buffer.begin() );
+        swapDoubleProp( ip, buffer );
+    }
+    buffer.clear();
+    thrust::device_vector<short> buffer_short( gpu_nparts_ );
+    for( int ip = 0; ip < getNShortProp(); ip++ ) {
+        thrust::gather( thrust::device, index.begin(), index.end(), getPtrShortProp( ip ), buffer_short.begin() );
+        swapShortProp( ip, buffer_short );
+    }
+    buffer_short.clear();
+    if( tracked ) {
+        thrust::device_vector<uint64_t> buffer_uint64( gpu_nparts_ );
+        thrust::gather( thrust::device, index.begin(), index.end(), getPtrId(), buffer_uint64.begin() );
+        swapId( buffer_uint64 );
+        buffer_uint64.clear();
+    }
+}
+
+//! Sort by cell_keys_
+//!
 This version is asynchronous, but requires a buffer of equal size to be provided
+void nvidiaParticles::sortParticleByKey( nvidiaParticles& buffer )
+{
+    // Make a sorting map using the cell keys (like numpy.argsort)
+    thrust::device_vector<int> index( gpu_nparts_ );
+    thrust::sequence( thrust::device, index.begin(), index.end() );
+    thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() );
+
+    // Sort particles using thrust::gather, according to the sorting map
+    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
+        thrust::gather( thrust::cuda::par_nosync, index.begin(), index.end(), getPtrDoubleProp( ip ), buffer.getPtrDoubleProp( ip ) );
+    }
+    for( int ip = 0; ip < getNShortProp(); ip++ ) {
+        thrust::gather( thrust::cuda::par_nosync, index.begin(), index.end(), getPtrShortProp( ip ), buffer.getPtrShortProp( ip ) );
+    }
+    if( tracked ) {
+        thrust::gather( thrust::cuda::par_nosync, index.begin(), index.end(), getPtrId(), buffer.getPtrId() );
+    }
+    cudaDeviceSynchronize();
+
+    swap( buffer );
+}
+
 int nvidiaParticles::prepareBinIndex()
 {
     if( first_index.size() == 0 ) {
@@ -1747,7 +1200,10 @@ void nvidiaParticles::naiveImportAndSortParticles( nvidiaParticles* particles_to
     eraseLeavingParticles();

     // Inject newly arrived particles in particles_to_inject
-    injectParticles( particles_to_inject );
+    const size_t current_size = gpu_nparts_;
+    resize( current_size + particles_to_inject->size() );
+    copyParticles( particles_to_inject, current_size );
+    particles_to_inject->clear();
 }

 extern "C"
diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h
index 0bb254cef..906d3709e 100644
--- a/src/Particles/nvidiaParticles.h
+++ b/src/Particles/nvidiaParticles.h
@@ -111,45 +111,42 @@ class nvidiaParticles : public Particles
     uint64_t * getPtrId() override {
         return thrust::raw_pointer_cast( nvidia_id_.data() );
     };
-
-    void swapPosition( int idim, thrust::device_vector<double> &new_vector ) {
-        nvidia_position_[idim].swap( new_vector );
-    };
-    void swapMomentum( int idim, thrust::device_vector<double> &new_vector ) {
-        nvidia_momentum_[idim].swap( new_vector );
+
+    size_t getNDoubleProp() {
+        return nvidia_double_prop_.size();
     };
-    void swapWeight( thrust::device_vector<double> &new_vector ) {
-        nvidia_weight_.swap( new_vector );
+    size_t getNShortProp() {
+        return nvidia_short_prop_.size();
     };
-    void swapChi( thrust::device_vector<double> &new_vector ) {
-        nvidia_chi_.swap( new_vector );
+
+    double * getPtrDoubleProp( int iprop ) {
+        return thrust::raw_pointer_cast( nvidia_double_prop_[iprop]->data() );
     };
-    void swapCharge( thrust::device_vector<short> &new_vector ) {
-        nvidia_charge_.swap( new_vector );
+    short * getPtrShortProp( int iprop ) {
+        return thrust::raw_pointer_cast( nvidia_short_prop_[iprop]->data() );
     };
-    void swapTau( thrust::device_vector<double> &new_vector ) {
-        nvidia_tau_.swap( new_vector );
+
+    void swapDoubleProp( int iprop, thrust::device_vector<double> &new_vector ) {
+        nvidia_double_prop_[iprop]->swap( new_vector );
     };
-    void swapCellKeys( thrust::device_vector<int> &new_vector ) {
-        nvidia_cell_keys_.swap( new_vector );
+    void swapShortProp( int iprop, thrust::device_vector<short> &new_vector ) {
+        nvidia_short_prop_[iprop]->swap( new_vector );
     };
     void swapId( thrust::device_vector<uint64_t> &new_vector ) {
         nvidia_id_.swap( new_vector );
     };

-    void swap( nvidiaParticles &p ) {
-        for( int idim = 0; idim < dimension(); idim++ ) {
-            swapPosition( idim, p.nvidia_position_[idim] );
+    void swap( nvidiaParticles & p ) {
+        for( int iprop = 0; iprop < getNDoubleProp(); iprop++ ) {
             nvidia_double_prop_[iprop]->swap( *p.nvidia_double_prop_[iprop] );
         }
-        for( int idim = 0; idim < 3; idim++ ) {
-            swapMomentum( idim, p.nvidia_momentum_[idim] );
+        for( int iprop = 0; iprop < getNShortProp(); iprop++ ) {
+            nvidia_short_prop_[iprop]->swap( *p.nvidia_short_prop_[iprop] );
         }
-        swapWeight( p.nvidia_weight_ );
-        swapCharge( p.nvidia_charge_ );
         if( tracked ) {
-            swapId( p.nvidia_id_ );
+            nvidia_id_.swap( p.nvidia_id_ );
         }
-    };
+    }

     // -----------------------------------------------------------------------------
     //! Move leaving particles to the buffers
     // -----------------------------------------------------------------------------
@@ -158,6 +155,12 @@ class nvidiaParticles
     template <typename Predicate>
     void copyParticlesByPredicate( Particles* buffer, Predicate pred );
+
+    //! Resize & Copy particles from particles_to_inject to end of vectors
+    void copyParticles( Particles* particles_to_inject ) override;
+
+    //! Copy particles from particles_to_inject to specific offset
+    void copyParticles( nvidiaParticles* particles_to_inject, size_t offset );

     // -----------------------------------------------------------------------------
     //! Erase particles leaving the patch object on device and returns the number of particle removed
     // -----------------------------------------------------------------------------
@@ -167,11 +170,6 @@ class nvidiaParticles
     template <typename Predicate>
     int eraseParticlesByPredicate( Predicate pred );

-    // -----------------------------------------------------------------------------
-    //! Inject particles from particles_to_inject into *this and return the number of particle added
-    // -----------------------------------------------------------------------------
-    int injectParticles( Particles* particles_to_inject ) override;
-
     // ---------------------------------------------------------------------------------------------------------------------
     //! Create n_additional_particles new particles at the end of vectors
     //! Fill the new elements with 0
@@ -181,6 +179,12 @@
     //! See the Particles class for documentation.
     void importAndSortParticles( Particles* particles_to_inject ) override;

+    //! Sort by cell_keys_
+    //! This version synchronizes for every vector, but uses less buffers
+    void sortParticleByKey();
+    //! This version is asynchronous, but requires a buffer of equal size to be provided
+    void sortParticleByKey( nvidiaParticles& buffer );
+
 protected:

     //! Redefine first_index and last_index according to the binning algorithm
     //! used on GPU.
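Both new ``sortParticleByKey`` methods are an argsort followed by a gather, repeated over each per-property device array. A condensed, self-contained sketch of the pattern for a single ``double`` property (a toy helper assuming only thrust, not code from the patch):

.. code-block:: c++

    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/gather.h>
    #include <thrust/sequence.h>
    #include <thrust/sort.h>

    // Reorder 'prop' to follow the non-decreasing order of 'keys'.
    // 'keys' is sorted in place, exactly once, as in the patch.
    void sortOnePropertyByKey( thrust::device_vector<int>&    keys,
                               thrust::device_vector<double>& prop )
    {
        // Argsort: 'index' receives the permutation that sorts 'keys'
        thrust::device_vector<int> index( keys.size() );
        thrust::sequence( thrust::device, index.begin(), index.end() );
        thrust::sort_by_key( thrust::device, keys.begin(), keys.end(), index.begin() );

        // Apply the permutation out of place: buffer[i] = prop[index[i]]
        thrust::device_vector<double> buffer( prop.size() );
        thrust::gather( thrust::device, index.begin(), index.end(), prop.begin(), buffer.begin() );

        // Constant-time swap of the sorted data into place
        prop.swap( buffer );
    }

In the patch, the permutation is computed once and the gather runs once per double, short and uint64 property: the synchronous variant recycles one scratch vector per value type, while the asynchronous variant gathers with ``thrust::cuda::par_nosync`` into the caller-provided ``nvidiaParticles`` buffer and issues a single ``cudaDeviceSynchronize()`` before swapping the buffer in.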
diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp
index 65358f555..19b39c2ed 100755
--- a/src/Species/Species.cpp
+++ b/src/Species/Species.cpp
@@ -2103,8 +2103,10 @@ void Species::importParticles( Params &params, Patch *patch, Particles &source_p
     // Warning: the current GPU version does not handle tracked particles

     // Inject particles from source_particles
-    particles->last_index.back() += particles->injectParticles( &source_particles );
+    particles->copyParticles( &source_particles );
+    particles->last_index.back() += source_particles.size();
     particles->last_index[0] = particles->last_index.back();
+    source_particles.clear();

 #else
     // ---------------------------------------------------

From 0b7d91e7b0592bbf3f7c50557145f3d443e5d27d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Brian=20Edward=20Marr=C3=A9?=
Date: Fri, 17 May 2024 11:50:01 +0200
Subject: [PATCH 26/54] fix for documentation typos (#716)

---
 doc/Sphinx/implementation.rst | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/doc/Sphinx/implementation.rst b/doc/Sphinx/implementation.rst
index 46bf953e9..c524cb560 100644
--- a/doc/Sphinx/implementation.rst
+++ b/doc/Sphinx/implementation.rst
@@ -10,10 +10,10 @@ and conveniency for non-advanced C++ users.
 The repository is composed of the following directories:

 - ``Licence``: contains code licence information
-- ``doc``: conatins the Sphinx doc files
+- ``doc``: contains the Sphinx doc files
 - ``src``: contains all source files
 - ``happi``: contains the sources of the happi Python tool for visualization
-- ``benchmarks``: contains the benchmarks used by the validation process. these becnhamrks are also examples for users.
+- ``benchmarks``: contains the benchmarks used by the validation process; these benchmarks are also examples for users.
 - ``scripts``: contains multiple tool scripts for compilation and more
 - ``compile_tools``: contains scripts and machine files used by the makefile for compilation

@@ -23,7 +23,7 @@ The repository is composed of the following directories:
 The source files directory is as well composed of several sub-directories to organise the `.cpp` and `.h` files by related thematics.
 The main is the file `Smilei.cpp`.
-There is always only one class definition per file and the file name correcponds to the class name.
+There is always only one class definition per file and the file name corresponds to the class name.

 The general implementation is later summarized in :numref:`smilei_main_loop`

@@ -54,10 +54,10 @@ Notion of operators

 An operator is a class that operates on input data to provide a processed information.
 Input data can be parameters and data containers.
 Output data can be processed data from data containers or updated data containers.
-An operator is a class functor (overloadind of the ``()`` ).
-Sometime, operator provides additional methods called wrappers to provide differents simplified or adapted interfaces.
-An operator do not store data or temporarely.
-for instance, the particle interpolation, push and proection are operators.
+An operator is a class functor (overloading of the ``()`` operator).
+Sometimes, an operator provides additional methods, called wrappers, that expose different simplified or adapted interfaces.
+An operator does not store data, even temporarily.
+For instance, the particle interpolation, push and projection are operators.

 .. _operator:

@@ -71,7 +71,7 @@ Notion of domain parts

 Domain parts are classes that represents some specific levels of the domain decomposition.
 They can be seen as high-level data container or container of data container.
-They contain some methods to handle, manange and access the local data.
+They contain some methods to handle, manage and access the local data.
 For instance, patches and ``Species`` are domain parts:

 - ``Species`` contains the particles.
@@ -80,10 +80,10 @@ For instance, patches and ``Species`` are domain parts:
 Notion of factory
 ------------------------------------

-Some objects such as operators or data containers have sereral variations.
+Some objects such as operators or data containers have several variations.
 For this we use inheritance.
 A base class is used for common parameters and methods and derived classes are used for all variations.
-The factory uses user-defined input parameters to determine the right derive class to choose and initiate them as shown in :numref:`factory`.
+The factory uses user-defined input parameters to determine the right derived class and instantiate it, as shown in :numref:`factory`.
 For instance, there are several ``push`` operators implemented all derived from a base ``push`` class.
 The ``push`` factory will determine the right one to use.

@@ -97,7 +97,7 @@ The ``push`` factory will determine the right one to use.
 Other
 ------------------------------------

-Some classes are used for specific actions in the code such as the initilization process.
+Some classes are used for specific actions in the code such as the initialization process.

 -----------------------------------------------------------------

 III. Domain decomposition and parallelism
 -----------------------------------------------------------------

 The simulation domain is divided multiple times following a succession of decomposition levels.
 The whole domain is the superimposition of different grids for each electromagnetic field component
-and macro-particules.
+and macro-particles.
 Let us represent schematically the domain as an array of cells as in Fig. :numref:`full_domain`.
 Each cell contains a certain population of particles (that can differ from cell to cell).

@@ -127,8 +127,8 @@ The domain becomes a collection of patches as shown in :numref:`patch_domain_dec
 The domain in :program:`Smilei` is a collection of patches.
-A patch is an independant piece of the whole simulation domain.
-It therefore owns local electrmognatic grids and list of macro-particles.
+A patch is an independent piece of the whole simulation domain.
+It therefore owns the local electromagnetic grids and the list of macro-particles.
 Electromagnetic grids have ghost cells that represent the information located in the neighboring patches (not shown in :numref:`patch_domain_decomposition`).
 All patches have the same spatial size .i.e. the same number of cells.
 The size of a patch is calculated so that all local field grids (ghost cells included) can fit in L2 cache.

@@ -144,7 +144,7 @@ The distribution can be ensured in an equal cartesian way or using a load balanc
 Patches are then distributed among MPI processes in so-called MPI patch collections.
 Inside MPI patch collection, OpenMP loop directives are used to distribute the computation of the patches
 among the available threads.
-Since each patch have a different number of particles, this approach enables a dynamic scheduling depending on the specified OpenMP scheduler.
+Since each patch has a different number of particles, this approach enables a dynamic scheduling depending on the specified OpenMP scheduler.
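To make the scheduling point concrete, the patch loop can be sketched as follows (a simplified illustration of the pattern described above; the actual loop in ``vecPatches::dynamics`` passes many more arguments, and the schedule clause shown here is an assumption standing in for "the specified OpenMP scheduler"):

.. code-block:: c++

    // Each thread of the team picks up patches as it becomes free; since
    // patches hold different numbers of particles, a non-static schedule
    // balances the load across threads.
    #pragma omp for schedule(runtime)
    for( unsigned int ipatch = 0 ; ipatch < vecPatches.size() ; ipatch++ ) {
        for( unsigned int ispec = 0 ; ispec < vecPatches( ipatch )->vecSpecies.size() ; ispec++ ) {
            vecPatches( ipatch )->vecSpecies[ispec]->dynamics( /* ... */ );
        }
    }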
As shown in :numref:`smilei_main_loop`, a synchronization step is required to exchange grid ghost cells and particles traveling from patch to patch. The patch granularity is used for: @@ -163,7 +163,7 @@ The patch can be decomposed into bins as shown in :numref:`bin_decomposition`. Bin decomposition. -Contrary to patch, a bin is not an independant data structure with its own arrays. +Contrary to patch, a bin is not an independent data structure with its own arrays. It represents a smaller portion of the patch grids through specific start and end indexes. For the macro-particles, a sorting algorithm is used to ensure that in the macro-particles located in the same bin are grouped and contiguous in memory. @@ -288,7 +288,7 @@ located in the file `src/Tools.h`. - `ERROR_NAMELIST`: this function should be used for namelist error. It takes in argument a simple message and a link to the documentation. It throws as well a SIGABRT signal. - `MESSAGE`: this function should be used to output an information message (it uses `std::cout`). - `DEBUG` : should be used for debugging messages (for the so-called DEBUG mode) -- `WARNING` : should be used to thrown a warning. A warning alerts the users of a possible issue or to be carreful with some parameters without stoping the program. +- `WARNING` : should be used to thrown a warning. A warning alerts the users of a possible issue or to be careful with some parameters without stopping the program. -------------------------------------------------------------------------------- @@ -618,7 +618,7 @@ We first loop on the patches and then the species of each patch ``ipatch``: ``(*this )( ipatch )->vecSpecies.size()``. For each species, the method ``Species::dynamics`` is called to perform the dynamic step of the respective particles. -The OpenMP parallelism is explicitely applied in ``vecPatches::dynamics`` on the patch loop as shown +The OpenMP parallelism is explicitly applied in ``vecPatches::dynamics`` on the patch loop as shown in the following pieces of code. .. code-block:: c++ From 5b60a4d771e3ba59e9aba251a5110a0d2366bfb9 Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Sun, 19 May 2024 10:49:22 +0200 Subject: [PATCH 27/54] add article, use extended journal names for each article --- doc/Sphinx/Overview/material.rst | 197 ++++++++++++++++--------------- 1 file changed, 101 insertions(+), 96 deletions(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 04973edbf..2d33d6aff 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of May 2024, 186 papers have been published covering a broad range of topics: +As of May 2024, 187 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,7 +50,12 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. +.. [Ivanov2024] + K. A. Ivanov, D. A. Gorlova, I. N. Tsymbalov, I. P. Tsygvintsev, S. A. Shulyapov, R. V. Volkov, and A. B. 
Savel’ev, + `Laser-driven pointed acceleration of electrons with preformed plasma lens`, + `Physical Review Accelerators and Beams 27, 051301 (2024) `_ + .. [Timmis2024] R. J. L. Timmis, R. W. Paddock, I. Ouatu, J. Lee, S. Howard, E. Atonga, R. T. Ruskov, H. Martin, R. H. W. Wang, R. Aboushelbaya, M. W. von der Leyen, E. Gumbrell and P. A. Norreys, @@ -79,7 +84,7 @@ Following is the distribution of these topics in the listed publications up to N M. Luo, C. Riconda, I. Pusztai, A. Grassi, J. S. Wurtele, and T. Fülöp, `Control of autoresonant plasma beat-wave wakefield excitation`, - `Phys. Rev. Research 6, 013338 (2024) `_ + `Physical Review Research 6, 013338 (2024) `_ .. [Krafft2024] @@ -247,7 +252,7 @@ Following is the distribution of these topics in the listed publications up to N E. Starodubtseva, I. Tsymbalov, D. Gorlova, K. Ivanov, and A. Savel'ev, `Low energy electron injection for direct laser acceleration`, - `Phys. Plasmas 30, 083105 (2023) `_ + `Physics of Plasmas 30, 083105 (2023) `_ .. [Maffini2023] @@ -259,7 +264,7 @@ Following is the distribution of these topics in the listed publications up to N S. Yu. Gus'kov, Ph. Korneev, and M. Murakami, `Laser-driven electrodynamic implosion of fast ions in a thin shell`, - `Matter Radiat. Extremes 8, 056602 (2023) `_ + `Matter and Radiation at Extremes 8, 056602 (2023) `_ .. [RezaeiPandari2023] @@ -271,19 +276,19 @@ Following is the distribution of these topics in the listed publications up to N J. Jonnerby, A. von Boetticher, J. Holloway, L. Corner, A. Picksley, A. J. Ross, R. J. Shalloo , C. Thornton, N. Bourgeois, R. Walczak, and S. M. Hooker, `Measurement of the decay of laser-driven linear plasma wakefields`, - `Phys. Rev. E 108, 055211 (2023) `_ + `Physical Review E 108, 055211 (2023) `_ .. [Drobniak2023] P. Drobniak, E. Baynard, C. Bruni, K. Cassou, C. Guyot, G. Kane, S. Kazamias, V. Kubytskyi, N. Lericheux, B. Lucas, M. Pittman, F. Massimo, A. Beck, A. Specka, P. Nghiem, and D. Minenna, `Random scan optimization of a laser-plasma electron injector based on fast particle-in-cell simulations`, - `Phys. Rev. Accel. Beams 26, 091302 (2023) `_ + `Physical Review Accelerators and Beams 26, 091302 (2023) `_ .. [Bukharskii2023] N. Bukharskii and Ph. Korneev, `Intense widely controlled terahertz radiation from laser-driven wires`, - `Matter Radiat. Extremes 8, 044401 (2023) `_ + `Matter and Radiation at Extremes 8, 044401 (2023) `_ .. [Schmitz2023] @@ -307,7 +312,7 @@ Following is the distribution of these topics in the listed publications up to N X. Gao, `Ionization dynamics of sub-micrometer-sized clusters in intense ultrafast laser pulses`, - `Phys. Plasmas 30, 052102 (2023) `_ + `Physics of Plasmas 30, 052102 (2023) `_ .. [Krafft2023] @@ -325,7 +330,7 @@ Following is the distribution of these topics in the listed publications up to N A. Ghizzo, D. Del Sarto, and H. Betar, `Collisionless Heating Driven by Vlasov Filamentation in a Counterstreaming Beams Configuration`, - `Phys. Rev. Lett. 131, 035101 (2023) `_ + `Physical Review Letters 131, 035101 (2023) `_ .. [Yang2023] @@ -337,31 +342,31 @@ Following is the distribution of these topics in the listed publications up to N W. Yao, A. Fazzini, S.N. Chen, K. Burdonov, J. Béard, M. Borghesi, A. Ciardi, M. Miceli, S. Orlando, X. Ribeyre, E. d'Humières and J. Fuchs, `Investigating particle acceleration dynamics in interpenetrating magnetized collisionless super-critical shocks`, - `J. Plasma Phys. 89, 915890101 (2023) `_ + `Journal of Plasma Physics 89, 915890101 (2023) `_ .. [Pak2023] T. 
Pak, M. Rezaei-Pandari, S. B. Kim, G. Lee, D. H. Wi, C. I. Hojbota, M. Mirzaie, H. Kim, J. H. Sung, S. K. Lee, C. Kang and K.-Y. Kim, `Multi-millijoule terahertz emission from laser-wakefield-accelerated electrons`, - `Light Sci Appl 12, 37 (2023) `_ + `Light: Science and Applications 12, 37 (2023) `_ .. [Istokskaia2023] V. Istokskaia, M. Tosca, L. Giuffrida, J. Psikal, F. Grepl, V. Kantarelou, S. Stancek, S. Di Siena, A. Hadjikyriacou, A. McIlvenny, Y. Levy, J. Huynh, M. Cimrman, P. Pleskunov, D. Nikitin, A. Choukourov, F. Belloni, A. Picciotto, S. Kar, M. Borghesi, A. Lucianetti, T. Mocek and D. Margarone, `A multi-MeV alpha particle source via proton-boron fusion driven by a 10-GW tabletop laser`, - `Commun Phys 6, 27 (2023) `_ + `Communications Physics 6, 27 (2023) `_ .. [Yoon2023] Y. D. Yoon, D. E. Wendel and G. S. Yun, `Equilibrium selection via current sheet relaxation and guide field amplification`, - `Nat Commun 14, 139 (2023) `_ + `Nature Communications 14, 139 (2023) `_ .. [Galbiati2023] M. Galbiati, A. Formenti, M. Grech and M. Passoni, `Numerical investigation of non-linear inverse Compton scattering in double-layer targets`, - `Front. Phys. 11, fphy.2023.1117543 (2023) `_ + `Frontiers in Physics 11, fphy.2023.1117543 (2023) `_ .. [Sakai2023] @@ -373,7 +378,7 @@ Following is the distribution of these topics in the listed publications up to N A. Golovanov, I. Yu. Kostyukov, A. Pukhov and V. Malka, `Energy-Conserving Theory of the Blowout Regime of Plasma Wakefield`, - `Phys. Rev. Lett. 130, 105001 (2023) `_ + `Physical Review Letters 130, 105001 (2023) `_ .. [Miethlinger2023] @@ -385,13 +390,13 @@ Following is the distribution of these topics in the listed publications up to N C. Zepter, A. Seidel, M. Zepf, M. C. Kaluza and A. Sävert, `Role of spatiotemporal couplings in stimulated Raman side scattering`, - `Phys. Rev. Research 5, L012023 (2023) `_ + `Physical Review Research 5, L012023 (2023) `_ .. [Marini2023] S. Marini, M. Grech, P. S. Kleij, M. Raynaud and C. Riconda, `Electron acceleration by laser plasma wedge interaction`, - `Phys. Rev. Research 5, 013115 (2023) `_ + `Physical Review Research 5, 013115 (2023) `_ .. [Blackman2022] @@ -451,7 +456,7 @@ Following is the distribution of these topics in the listed publications up to N D. Margarone, J. Bonvalet, L. Giuffrida, A. Morace, V. Kantarelou, M. Tosca, D. Raffestin, P. Nicolai, A. Picciotto, Y. Abe, Y. Arikawa, S. Fujioka, Y. Fukuda, Y. Kuramitsu, H. Habara and D. Batani, `In-Target Proton–Boron Nuclear Fusion Using a PW-Class Laser`, - `Appl. Sci. 12(3), 1444 (2022) `_ + `Appled Sciences 12(3), 1444 (2022) `_ .. [Kochetkov2022] @@ -463,13 +468,13 @@ Following is the distribution of these topics in the listed publications up to N A. Oudin, A. Debayle, C. Ruyer, D. Benisti, `Cross-beam energy transfer between spatially smoothed laser beams`, - `Phys. Plasmas 29, 112112 (2022) `_ + `Physics of Plasmas 29, 112112 (2022) `_ .. [Chen2022] Q. Chen, D. Maslarova, J. Wang, S. Li, and D. Umstadter, `Injection of electron beams into two laser wakefields and generation of electron rings`, - `Phys. Rev. E 106, 055202 (2022) `_ + `Physical Review E 106, 055202 (2022) `_ .. [Kumar2022b] @@ -481,7 +486,7 @@ Following is the distribution of these topics in the listed publications up to N S. Kumar, D. K. Singh and H. K. Malik, `Comparative study of ultrashort single-pulse and multi-pulse driven laser wakefield acceleration`, - `Laser Phys. Lett. 20, 026001 (2022) `_ + `Laser Physics Letters 20, 026001 (2022) `_ .. 
[Miloshevsky2022] @@ -505,25 +510,25 @@ Following is the distribution of these topics in the listed publications up to N I. Ouatu, B. T. Spiers, R. Aboushelbaya, Q. Feng, M. W. von der Leyen, R. W. Paddock, R. Timmis, C. Ticos, K. M. Krushelnick and P. A. Norreys, `Ionization states for the multipetawatt laser-QED regime`, - `Phys. Rev. E 106, 015205 (2022) `_ + `Physical Review E 106, 015205 (2022) `_ .. [Beth2022] A. Beth, H. Gunell, C. Simon Wedlund, C. Goetz, H. Nilsson and M. Hamrin, `First investigation of the diamagnetic cavity boundary layer with a 1D3V PIC simulation`, - `A&A 667, A143 (2022) `_ + `Astronomy & Astrophysics 667, A143 (2022) `_ .. [Guo2022] Y. Guo, X. Geng, L. Ji, B. Shen and R. Li, `Improving the accuracy of hard photon emission by sigmoid sampling of the quantum-electrodynamic table in particle-in-cell Monte Carlo simulations`, - `Phys. Rev. E 105, 025309 (2022) `_ + `Physical Review E 105, 025309 (2022) `_ .. [Pae2022] K. . Pae, C. M. Kim, V. B. Pathak, C.-M. Ryu and C. H. Nam, `Direct laser acceleration of electrons from a plasma mirror by an intense few-cycle Laguerre–Gaussian laser and its dependence on the carrier-envelope phase`, - `Plasma Phys. Control. Fusion 64, 055013 (2022) `_ + `Plasma Physics and Controlled Fusion 64, 055013 (2022) `_ .. [Zhang2022a] @@ -536,43 +541,43 @@ Following is the distribution of these topics in the listed publications up to N Q. Han, X. Geng, B. Shen, Z. Xu and L. Ji, `Ultra-fast polarization of a thin electron layer in the rotational standing-wave field driven by double ultra-intense laser pulses`, - `New J. Phys. 24, 063013 (2022) `_ + `New Journal of Physics 24, 063013 (2022) `_ .. [Gothel2022] I. Göthel, C. Bernert, M. Bussmann, M. Garten, T. Miethlinger, M. Rehwald, K. Zeil, T. Ziegler, T. E. Cowan, U. Schramm and T. Kluge, `Optimized laser ion acceleration at the relativistic critical density surface`, - `Plasma Phys. Control. Fusion 64, 044010 (2022) `_ + `Plasma Physics and Controlled Fusion 64, 044010 (2022) `_ .. [Fazzini2022] A. Fazzini, W. Yao, K. Burdonov, J. Béard, S. N. Chen, A. Ciardi, E. d’Humières, R. Diab, E. D. Filippov, S. Kisyov, V. Lelasseux, M. Miceli, Q. Moreno, S. Orlando, S. Pikuz, X. Ribeyre, M. Starodubtsev, R. Zemskov and J. Fuchs, `Particle energization in colliding subcritical collisionless shocks investigated in the laboratory`, - `A&A 665, A87 (2022) `_ + `Astronomy & Astrophysics 665, A87 (2022) `_ .. [Bykov2022] A. M. Bykov, S. M. Osipov and V. I. Romanskii, `Acceleration of Cosmic Rays to Energies above 1015 eV by Transrelativistic Shocks`, - `J. Exp. Theor. Phys. 134, 487-497 (2022) `_ + `Journal of Experimental and Theoretical Physics 134, 487-497 (2022) `_ .. [Sundstrom2022] A. Sundström, M. Grech, I. Pusztai and C. Riconda, `Stimulated-Raman-scattering amplification of attosecond XUV pulses with pulse-train pumps and application to local in-depth plasma-density measurement`, - `Phys. Rev. E 106, 045208 (2022) `_ + `Physical Review E 106, 045208 (2022) `_ .. [Krafft2022b] C. Krafft and P. Savoini, `Third and Fourth Harmonics of Electromagnetic Emissions by a Weak Beam in a Solar Wind Plasma with Random Density Fluctuations`, - `ApJL 934, L28 (2022) `_ + `The Astrophysical Journal Letters 934, L28 (2022) `_ .. [Krafft2022a] C. Krafft and P. Savoini, `Fundamental Electromagnetic Emissions by a Weak Electron Beam in Solar Wind Plasmas with Density Fluctuations`, - `ApJL 924, L24 (2022) `_ + `The Astrophysical Journal Letters 924, L24 (2022) `_ .. 
[Kong2022] @@ -584,7 +589,7 @@ Following is the distribution of these topics in the listed publications up to N C. Davidson, Z.-M. Sheng, T. Wilson and P. McKenna, `Theoretical and computational studies of the Weibel instability in several beam–plasma interaction configurations`, - `J. Plasma Phys. 88, 905880206 (2022) `_ + `Journal of Plasma Physics 88, 905880206 (2022) `_ .. [Glek2022] @@ -596,7 +601,7 @@ Following is the distribution of these topics in the listed publications up to N D. Umstadter `Controlled Injection of Electrons for Improved Performance of Laser-Wakefield Acceleration`, - `United States: N. p., (2022) `_ + `United States Department of Energy Technical Report (2022) `_ .. [Massimo2022] @@ -615,7 +620,7 @@ Following is the distribution of these topics in the listed publications up to N P. K. Singh, F.-Y. Li, C.-K. Huang, A. Moreau, R. Hollinger, A. Junghans, A. Favalli, C. Calvi, S. Wang, Y. Wang, H. Song, J. J. Rocca, R. E. Reinovsky and S. Palaniyappan, `Vacuum laser acceleration of super-ponderomotive electrons using relativistic transparency injection`, - `Nat Commun 13, 54 (2022) `_ + `Nature Communications 13, 54 (2022) `_ .. [Lobet2022] @@ -646,13 +651,13 @@ Following is the distribution of these topics in the listed publications up to N P. Tomassini, F. Massimo, L. Labate and L. A. Gizzi, `Accurate electron beam phase-space theory for ionization-injection schemes driven by laser pulses`, - `High Pow Laser Sci Eng 10, e15 (2021) `_ + `High Power Laser Science and Engineering 10, e15 (2021) `_ .. [Meinhold2021] T. A. Meinhold and N. Kumar, `Radiation pressure acceleration of protons from structured thin-foil targets`, - `J. Plasma Phys. 87, 905870607 (2021) `_ + `Journal of Plasma Physics 87, 905870607 (2021) `_ .. [Bonvalet2021b] @@ -664,13 +669,13 @@ Following is the distribution of these topics in the listed publications up to N Y. Shi, D. R. Blackman and A. Arefiev, `Electron acceleration using twisted laser wavefronts`, - `Plasma Phys. Control. Fusion 63, 125032 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 125032 (2021) `_ .. [Kumar2021] N. Kumar and B. Reville, `Nonthermal Particle Acceleration at Highly Oblique Nonrelativistic Shocks`, - `ApJL 921, L14 (2021) `_ + `The Astrophysical Journal Letters 921, L14 (2021) `_ .. [Ghaith2021] @@ -682,13 +687,13 @@ Following is the distribution of these topics in the listed publications up to N V. Horný and L. Veisz, `Generation of single attosecond relativistic electron bunch from intense laser interaction with a nanosphere`, - `Plasma Phys. Control. Fusion 63, 125025 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 125025 (2021) `_ .. [Krafft2021] C. Krafft and P. Savoini, `Second Harmonic Electromagnetic Emissions by an Electron Beam in Solar Wind Plasmas with Density Fluctuations`, - `ApJL 917, L23 (2021) `_ + `The Astrophysical Journal Letters 917, L23 (2021) `_ .. [Khalilzadeh2021c] @@ -712,7 +717,7 @@ Following is the distribution of these topics in the listed publications up to N Y. Shou, D. Wang, P. Wang, J. Liu, Z. Cao, Z. Mei, S. Xu, Z. Pan, D. Kong, G. Qi, Z. Liu, Y. Liang, Z. Peng, Y. Gao, S. Chen, J. Zhao, Y. Zhao, H. Xu, J. Zhao, Y. Wu, X. Yan and W. Ma, `High-efficiency generation of narrowband soft x rays from carbon nanotube foams irradiated by relativistic femtosecond lasers`, - `Opt. Lett. 46, 3969 (2021) `_ + `Optics Letters 46, 3969 (2021) `_ .. [Khalilzadeh2021b] @@ -724,67 +729,67 @@ Following is the distribution of these topics in the listed publications up to N H. 
Hosseinkhani, M. Pishdast, J. Yazdanpanah and S. A. Ghasemi, `Investigation of the classical and quantum radiation reaction effect on interaction of ultra high power laser with near critical plasma`, - `J. Nuclear Sci. Technol. 42, 27-35 (2021) `_ + `Journal of Nuclear Science, Engineering and Technology 42, 27-35 (2021) `_ .. [MercuriBaron2021] A. Mercuri-Baron, M. Grech, F. Niel, A. Grassi, M. Lobet, A. Di Piazza and C. Riconda, `Impact of the laser spatio-temporal shape on Breit–Wheeler pair production`, - `New J. Phys. 23, 085006 (2021) `_ + `New Journal of Physics 23, 085006 (2021) `_ .. [Peng2021] H. Peng, C. Riconda, S. Weber, C.T. Zhou and S.C. Ruan, `Frequency Conversion of Lasers in a Dynamic Plasma Grating`, - `Phys. Rev. Applied 15, 054053 (2021) `_ + `Physical Review Applied 15, 054053 (2021) `_ .. [Shi2021a] Y. Shi, D. Blackman, D. Stutman and A. Arefiev, `Generation of Ultrarelativistic Monoenergetic Electron Bunches via a Synergistic Interaction of Longitudinal Electric and Magnetic Fields of a Twisted Laser`, - `Phys. Rev. Lett. 126, 234801 (2021) `_ + `Physical Review Letters 126, 234801 (2021) `_ .. [Bonvalet2021a] J. Bonvalet, Ph. Nicolaï, D. Raffestin, E. D'humieres, D. Batani, V. Tikhonchuk, V. Kantarelou, L. Giuffrida, M. Tosca, G. Korn, A. Picciotto, A. Morace, Y. Abe, Y. Arikawa, S. Fujioka, Y. Fukuda, Y. Kuramitsu, H. Habara and D. Margarone, `Energetic α-particle sources produced through proton-boron reactions by high-energy high-intensity laser beams`, - `Phys. Rev. E 103, 053202 (2021) `_ + `Physical Review E 103, 053202 (2021) `_ .. [Shekhanov2021] S. A. Shekhanov and V. T. Tikhonchuk, `SRS-SBS competition and nonlinear laser energy absorption in a high temperature plasma`, - `Plasma Phys. Control. Fusion 63, 115016 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 115016 (2021) `_ .. [Psikal2021] J Psikal, `Laser-driven ion acceleration from near-critical Gaussian plasma density profile`, - `Plasma Phys. Control. Fusion 63, 064002 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 064002 (2021) `_ .. [Yoon2021b] Y. D. Yoon, G. S. Yun, D. E. Wendel and J. L. Burch, `Collisionless relaxation of a disequilibrated current sheet and implications for bifurcated structures`, - `Nat Commun 12, 3774 (2021) `_ + `Nature Communications 12, 3774 (2021) `_ .. [Lavorenti2021] F. Lavorenti, P. Henri, F. Califano, S. Aizawa and N. André, `Electron acceleration driven by the lower-hybrid-drift instability. An extended quasilinear model`, - `A&A 652, 202141049 (2021) `_ + `Astronomy & Astrophysics 652, 202141049 (2021) `_ .. [Golovanov2021] A A Golovanov, I Yu Kostyukov, L Reichwein, J Thomas and A Pukhov, `Excitation of strongly nonlinear plasma wakefield by electron bunches`, - `Plasma Phys. Control. Fusion 63, 085004 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 085004 (2021) `_ .. [Jirka2021] M. Jirka, P. Sasorov, S. S. Bulanov, G. Korn, B. Rus and S. V. Bulanov, `Reaching high laser intensity by a radiating electron`, - `Phys. Rev. A 103, 053114 (2021) `_ + `Physical Review A 103, 053114 (2021) `_ .. [Marques2021] @@ -814,7 +819,7 @@ Following is the distribution of these topics in the listed publications up to N G. Cantono, A. Permogorov, J. Ferri, E. Smetanina, A. Dmitriev, A. Persson, T. Fülöp and C.-G. Wahlström, `Laser-driven proton acceleration from ultrathin foils with nanoholes`, - `Sci Rep 11, 5006 (2021) `_ + `Scientific Reports 11, 5006 (2021) `_ .. 
[Perez2021] @@ -832,13 +837,13 @@ Following is the distribution of these topics in the listed publications up to N A. Sampath, X. Davoine, S. Corde, L. Gremillet, M. Gilljohann, M. Sangal, C. H. Keitel, R. Ariniello, J. Cary, H. Ekerfelt, C. Emma, F. Fiuza, H. Fujii, M. Hogan, C. Joshi, A. Knetsch, O. Kononenko, V. Lee, M. Litos, K. Marsh, Z. Nie, B. O’Shea, J. R. Peterson, P. San Miguel Claveria, D. Storey, Y. Wu, X. Xu, C. Zhang and M. Tamburini, `Extremely Dense Gamma-Ray Pulses in Electron Beam-Multifoil Collisions`, - `Phys. Rev. Lett. 126, 064801 (2021) `_ + `Physical Review Letters 126, 064801 (2021) `_ .. [Marini2021a] S. Marini, P. S. Kleij, F. Pisani, F. Amiranoff, M. Grech, A. Macchi, M. Raynaud and C. Riconda, `Ultrashort high energy electron bunches from tunable surface plasma waves driven with laser wavefront rotation`, - `Phys. Rev. E 103, L021201 (2021) `_ + `Physical Review E 103, L021201 (2021) `_ .. [Yao2021] @@ -850,14 +855,14 @@ Following is the distribution of these topics in the listed publications up to N E. G. Gelfer, A. M, Fedotov and S. Weber, `Radiation induced acceleration of ions in a laser irradiated transparent foil`, - `New J. Phys. 23, 095002 (2021) `_ + `New Journal of Physics 23, 095002 (2021) `_ `arXiv:1907.02621 `_ .. [Siminos2021] E. Siminos, I. Thiele and C. Olofsson, `Laser Wakefield Driven Generation of Isolated Carrier-Envelope-Phase Tunable Intense Subcycle Pulses`, - `Phys. Rev. Lett. 126, 044801 (2021) `_ + `Physical Review Letters 126, 044801 (2021) `_ `arXiv:1902.05014 `_ .. [Budriga2020] @@ -870,13 +875,13 @@ Following is the distribution of these topics in the listed publications up to N P. A. P. Nghiem, R. Assmann, A. Beck et al., `Toward a plasma-based accelerator at high beam energy with high beam charge and high beam quality`, - `Phys. Rev. Accel. Beams 23, 031301 (2020) `_ + `Physical Review Accelerators and Beams 23, 031301 (2020) `_ .. [Pisarczyk2020] T. Pisarczyk, M. Kalal, S. Yu. Gus'kov et al., `Hot electron retention in laser plasma created under terawatt subnanosecond irradiation of Cu targets`, - `Plasma Phys. Control. Fusion 62, 115020 (2020) `_ + `Plasma Physics and Controlled Fusion 62, 115020 (2020) `_ .. [Pagano2020] @@ -894,25 +899,25 @@ Following is the distribution of these topics in the listed publications up to N H. Peng, C. Riconda, M. Grech, C.-T. Zhou and S. Weber, `Dynamical aspects of plasma gratings driven by a static ponderomotive potential`, - `Plasma Phys. Control. Fusion 62, 115015 (2020) `_ + `Plasma Physics and Controlled Fusion 62, 115015 (2020) `_ .. [Glek2020] P. B. Glek, A. A. Voronin, V. Ya. Panchenko and A. M. Zheltikov, `Relativistic electron bunches locked to attosecond optical field waveforms: an attosecond light–matter bound state`, - `Laser Phys. Lett. 17 055401 (2020) `_ + `Laser Physics Letters 17 055401 (2020) `_ .. [Margarone2020] D. Margarone, A. Morace, J. Bonvalet et al., `Generation of α-Particle Beams With a Multi-kJ, Peta-Watt Class Laser System`, - `Front. Phys. 8, 343 (2020) `_ + `Frontiers in Physics 8, 343 (2020) `_ .. [Sinha2020] U. Sinha and N. Kumar, `Pair-beam propagation in a magnetized plasma for modeling the polarized radiation emission from gamma-ray bursts in laboratory astrophysics experiments`, - `Phys. Rev. E 101, 063204 (2020) `_ + `Physical Review E 101, 063204 (2020) `_ .. [Mitrofanov2020] @@ -924,81 +929,81 @@ Following is the distribution of these topics in the listed publications up to N B. T. Spiers, M. P. Hill, C. Brown, L. Ceurvorst, N. Ratan, A. F. Savin, P. 
Allan, E. Floyd, J. Fyrth, L. Hobbs, S. James, J. Luis, M. Ramsay, N. Sircombe, J. Skidmore, R. Aboushelbaya, M. W. Mayr, R. Paddock, R. H. W. Wang and P. A. Norreys, `Whole-beam self-focusing in fusion-relevant plasma`, - `Phil. Trans. R. Soc. A379, 20200159 `_ + `Philosophical Transactions of the Royal Society A379, 20200159 `_ .. [Derouillat2020] J. Derouillat and A. Beck, `Single Domain Multiple Decompositions for Particle-in-Cell simulations`, - `J. Phys.: Conf. Ser. 1596, 012052 (2020) `_ + `Journal of Physics: Conference Series 1596, 012052 (2020) `_ `arXiv:1912.04064 `_ .. [Zemzemi2020] I. Zemzemi, F. Massimo and A. Beck, `Azimuthal decomposition study of a realistic laser profile for efficient modeling of Laser WakeField Acceleration`, - `J. Phys.: Conf. Ser. 1596, 012055 (2020) `_ + `Journal of Physics: Conference Series 1596, 012055 (2020) `_ .. [Massimo2020b] F. Massimo, I. Zemzemi, A. Beck, J. Derouillat and A. Specka, `Efficient cylindrical envelope modeling for laser wakefield acceleration`, - `J. Phys.: Conf. Ser. 1596, 012054 (2020) `_ + `Journal of Physics: Conference Series 1596, 012054 (2020) `_ `arXiv:1912.04674 `_ .. [Massimo2020a] F. Massimo, A. Beck, J. Derouillat, I. Zemzemi and A. Specka, `Numerical modeling of laser tunneling ionization in particle-in-cell codes with a laser envelope model`, - `Phys. Rev. E 102, 033204 (2020) `_ + `Physical Review E 102, 033204 (2020) `_ `arXiv:2006.04433 `_ .. [Marcowith2020] A. Marcowith, G. Ferrand, M. Grech, Z. Meliani, I. Plotnikov and R. Walder, `Multi-scale simulations of particle acceleration in astrophysical systems`, - `Living Rev Comput Astrophys 6, 1 (2020) `_ + `Living Reviews in Computational Astrophysics 6, 1 (2020) `_ `arXiv:2002.09411 `_ .. [Dargent2020] J. Dargent, N. Aunai, B. Lavraud, S. Toledo‐Redondo and F. Califano, `Simulation of Plasmaspheric Plume Impact on Dayside Magnetic Reconnection`, - `Geophys. Res. Lett. 47, 2019GL086546 (2020) `_ + `Geophysical Research Letters 47, 2019GL086546 (2020) `_ `arXiv:2002.02243 `_ .. [Sundström2020b] A. Sundström, L. Gremillet, E. Siminos and I. Pusztai, `Collisional effects on the electrostatic shock dynamics in thin-foil targets driven by an ultraintense short pulse laser`, - `Plasma Phys. Control. Fusion 62, 085015 (2020) `_ + `Plasma Physics and Controlled Fusion 62, 085015 (2020) `_ .. [Sundström2020a] A. Sundström, L. Gremillet, E. Siminos and I. Pusztai, `Fast collisional electron heating and relaxation in thin foils driven by a circularly polarized ultraintense short-pulse laser`, - `J. Plasma Phys. 86, 755860201 (2020) `_ + `Journal of Plasma Physics 86, 755860201 (2020) `_ `arXiv:1911.09562 `_ .. [Gelfer2020] E. G. Gelfer, A. M. Fedotov, O. Klimo and S. Weber, `Absorption and opacity threshold for a thin foil in a strong circularly polarized laser field`, - `Phys. Rev. E 101, 033204 (2020) `_ + `Physical Review E 101, 033204 (2020) `_ `arXiv:1906.05902 `_ .. [Ferri2020] J. Ferri, I. Thiele, E. Siminos, L. Gremillet, E. Smetanina, A. Dmitriev, G. Cantono, C.-G. Wahlström and T. Fülöp, `Enhancement of laser-driven ion acceleration in non-periodic nanostructured targets`, - `J. Plasma Phys. 86, 905860101 (2020) `_ + `Journal of Plasma Physics 86, 905860101 (2020) `_ `arXiv:1905.11131 `_ .. [Marques2019] J.-R. Marquès, L. Lancia, T. Gangolf, M. Blecher, S. Bolaños, J. Fuchs, O. Willi, F. Amiranoff, R. L. Berger, M. Chiaramello, S. Weber, and C. 
Riconda, `Joule-Level High-Efficiency Energy Transfer to Subpicosecond Laser Pulses by a Plasma-Based Amplifier`, - `Phys. Rev. X 9, 021008 (2019) `_ + `Physical Review X 9, 021008 (2019) `_ .. [Plotnikov2019] I. Plotnikov and L. Sironi, @@ -1021,39 +1026,39 @@ Following is the distribution of these topics in the listed publications up to N X. S. Geng, L. L. Ji, B. F. Shen et al., `Quantum reflection above the classical radiation-reaction barrier in the quantum electro-dynamics regime`, - `Commun. Phys. 2, 66 (2019) `_ + `Communications Physics 2, 66 (2019) `_ .. [Sinha2019] U. Sinha, C. H. Keitel, and N. Kumar, `Polarized Light from the Transportation of a Matter-Antimatter Beam in a Plasma`, - `Phys. Rev. Lett. 122, 204801 (2019) `_ + `Physical Review Letters 122, 204801 (2019) `_ .. [Malko2019] S. Malko, X. Vaisseau, F. Perez, D. Batani, A. Curcio, M. Ehret, J. Honrubia, K. Jakubowska, A. Morace, J. J. Santos and L. Volpe, `Enhanced relativistic-electron beam collimation using two consecutive laser pulses`, - `Sci Rep 9, 14061 (2019) `_ + `Scientific Reports 9, 14061 (2019) `_ .. [Peng2019] H. Peng, C. Riconda, M. Grech, J.-Q. Su and S. Weber, `Nonlinear dynamics of laser-generated ion-plasma gratings: A unified description`, - `Phys. Rev. E 100, 061201 (2019) `_ + `Physical Review E 100, 061201 (2019) `_ `arXiv:1911.03440 `_ .. [Fang2019] J. Fang, C.-Y. Lu, J.-W. Yan and H. Yu, `Early acceleration of electrons and protons at the nonrelativistic quasiparallel shocks with different obliquity angles`, - `Res. Astron. Astrophys. 19, 182 (2019) `_ + `Research in Astronomy and Astrophysics 19, 182 (2019) `_ `arXiv:1908.08170 `_ .. [Yoon2019b] Y. Yoon and P. M. Bellan, `Kinetic Verification of the Stochastic Ion Heating Mechanism in Collisionless Magnetic Reconnection`, - `ApJ 887, L29 (2019) `_ + `The Astrophysical Journal Letters 887, L29 (2019) `_ .. [Yoon2019a] @@ -1065,7 +1070,7 @@ Following is the distribution of these topics in the listed publications up to N F. Massimo, A. Beck, J. Derouillat, M. Grech, M. Lobet, F. Pérez, I. Zemzemi and A Specka, `Efficient start-to-end 3D envelope modeling for two-stage laser wakefield acceleration experiments`, - `Plasma Phys. Control. Fusion 61, 124001 (2019) `_ + `Plasma Physics and Controlled Fusion 61, 124001 (2019) `_ `arXiv:1912.04127 `_ .. [Beck2019] @@ -1079,14 +1084,14 @@ Following is the distribution of these topics in the listed publications up to N F. Pérez and M. Grech, `Oblique-incidence, arbitrary-profile wave injection for electromagnetic simulations`, - `Phys. Rev. E 99, 033307 (2019) `_ + `Physical Review E 99, 033307 (2019) `_ `arXiv:1809.04435 `_ .. [Thiele2019] I. Thiele, E. Siminos and T. Fülöp, `Electron Beam Driven Generation of Frequency-Tunable Isolated Relativistic Subcycle Pulses`, - `Phys. Rev. Lett. 122, 104803 (2019) `_ + `Physical Review Letters 122, 104803 (2019) `_ `arXiv:1806.04976 `_ .. [Golovanov2018] @@ -1099,19 +1104,19 @@ Following is the distribution of these topics in the listed publications up to N S. Toledo-Redondo, J. Dargent, N. Aunai, B. Lavraud, M. André, W. Li, B. Giles, P.-A. Lindvist, R. E. Ergun, C. T. Russel and J. L. Burch, `Perpendicular Current Reduction Caused by Cold Ions of Ionospheric Origin in Magnetic Reconnection at the Magnetopause: Particle-in-Cell Simulations and Spacecraft Observations`, - `Geophys. Res. Lett. 45, 10,033 (2018) `_ + `Geophysical Research Letters 45, 10,033 (2018) `_ .. [Gelfer2018] E. Gelfer, N. Elkina and A. 
Fedotov,
  `Unexpected impact of radiation friction: enhancing production of longitudinal plasma waves`,
-  `Sci. Rep. 8, 6478 (2018) `_
+  `Scientific Reports 8, 6478 (2018) `_

.. [Niel2018b]

  F. Niel, C. Riconda, F. Amiranoff, M. Lobet, J. Derouillat, F. Pérez, T. Vinci and M. Grech,
  `From quantum to classical modeling of radiation reaction: a focus on the radiation spectrum`,
-  `Plasma Phys. Control. Fusion 60, 094002 (2018) `_
+  `Plasma Physics and Controlled Fusion 60, 094002 (2018) `_
  `arXiv:1802.02927 `_

.. [Plotnikov2018]
@@ -1125,21 +1130,21 @@ Following is the distribution of these topics in the listed publications up to N
  F. Niel, C. Riconda, F. Amiranoff, R. Duclous and M. Grech,
  `From quantum to classical modeling of radiation reaction: A focus on stochasticity effects`,
-  `Phys. Rev. E 97, 043209 (2018) `_
+  `Physical Review E 97, 043209 (2018) `_
  `arXiv:1707.02618 `_

.. [Grassi2017b]

  A. Grassi, M. Grech, F. Amiranoff, A. Macchi and C. Riconda,
  `Radiation-pressure-driven ion Weibel instability and collisionless shocks`,
-  `Phys. Rev. E 96, 033204 (2017) `_
+  `Physical Review E 96, 033204 (2017) `_
  `arXiv:1705.05402 `_

.. [Fedeli2017]

  L. Fedeli, A. Formenti, L. Cialfi, A. Sgattoni, G. Cantono and M. Passoni,
  `Structured targets for advanced laser-driven sources`,
-  `Plasma Phys. Control. Fusion 60, 014013 (2017) `_
+  `Plasma Physics and Controlled Fusion 60, 014013 (2017) `_

.. [Golovanov2017]
@@ -1151,19 +1156,19 @@ Following is the distribution of these topics in the listed publications up to N
  J. Dargent, N. Aunai, B. Lavraud, S. Toledo-Redondo, M. A. Shay, P. A. Cassak and K. Malakit,
  `Kinetic simulation of asymmetric magnetic reconnection with cold ions`,
-  `J. Geophys. Res. Space Physics 122, 5290-5306 (2017) `_
+  `Journal of Geophysical Research: Space Physics 122, 5290-5306 (2017) `_

.. [Grassi2017a]

  A. Grassi, M. Grech, F. Amiranoff, F. Pegoraro, A. Macchi and C. Riconda,
  `Electron Weibel instability in relativistic counterstreaming plasmas with flow-aligned external magnetic fields`,
-  `Phys. Rev. E 95, 023203 (2017) `_
+  `Physical Review E 95, 023203 (2017) `_

.. [Dargent2016]

  J. Dargent, N. Aunai, G. Belmont, N. Dorville, B. Lavraud and M. Hesse,
  `Full particle-in-cell simulations of kinetic equilibria and the role of the initial current sheet on steady asymmetric magnetic reconnection`,
-  `J. Plasma Phys. 82, 905820305 (2016) `_
+  `Journal of Plasma Physics 82, 905820305 (2016) `_

.. [Chiaramello2016]
@@ -1175,10 +1180,10 @@ Following is the distribution of these topics in the listed publications up to N
  A. Beck, J.T. Frederiksen and J. Dérouillat,
  `Load management strategy for Particle-In-Cell simulations in high energy particle acceleration`,
-  `Nucl. Inst. Meth. in Phys. Res. A 829, 418-421 (2016) `_
+  `Nuclear Instruments and Methods in Physics Research A 829, 418-421 (2016) `_

.. [Lancia2016]

  L. Lancia, A. Giribono, L. Vassura, M. Chiaramello, C. Riconda, S. Weber, A. Castan, A. Chatelain, A. Frank, T. Gangolf, M. N. Quinn, J. Fuchs and J.-R. Marquès,
  `Signatures of the Self-Similar Regime of Strongly Coupled Stimulated Brillouin Scattering for Efficient Short Laser Pulse Amplification`,
-  `Phys. Rev. Lett. 
116, 075001 (2016) `_ + `Physical Review Letters 116, 075001 (2016) `_ From 097422756966fccdf0630bb0c80e5a01d7c319a2 Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Tue, 21 May 2024 09:33:43 +0200 Subject: [PATCH 28/54] add publication --- doc/Sphinx/Overview/material.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 2d33d6aff..33184146f 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of May 2024, 187 papers have been published covering a broad range of topics: +As of May 2024, 188 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,6 +50,12 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. +.. [Salgado2024] + + F. C. Salgado, A. Kozan, D. Seipt, D. Hollatz, P. Hilz, M. Kaluza, A. Sävert, A. Seidel, D. Ullmann, Y. Zhao, and M. Zepf, + `All-optical source size and emittance measurements of laser-accelerated electron beams`, + `Physical Review Accelerators and Beams 27, 052803 (2024) `_ + .. [Ivanov2024] K. A. Ivanov, D. A. Gorlova, I. N. Tsymbalov, I. P. Tsygvintsev, S. A. Shulyapov, R. V. Volkov, and A. B. Savel’ev, From 443a62534c3c8bda2ba88a524f74faf82bfed2b5 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Wed, 22 May 2024 15:31:25 +0200 Subject: [PATCH 29/54] fix for AMD --- src/Particles/Particles.cpp | 2 +- src/Particles/Particles.h | 2 +- src/Particles/nvidiaParticles.cu | 115 +++++++++++++++++-------------- src/Particles/nvidiaParticles.h | 34 ++------- src/Species/Species.cpp | 5 +- src/Tools/gpu.h | 4 ++ 6 files changed, 75 insertions(+), 87 deletions(-) diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp index d4eea30e9..8285762c6 100755 --- a/src/Particles/Particles.cpp +++ b/src/Particles/Particles.cpp @@ -1398,7 +1398,7 @@ int Particles::eraseLeavingParticles() return 0; } -void Particles::copyParticles( Particles* particles_to_inject ) +int Particles::addParticles( Particles* particles_to_inject ) { ERROR( "Device only feature, should not have come here! On CPU it's done in sortParticles." ); } diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h index 91689ef3f..20b9c2ea6 100755 --- a/src/Particles/Particles.h +++ b/src/Particles/Particles.h @@ -485,7 +485,7 @@ class Particles // ----------------------------------------------------------------------------- //! Resize & Copy particles from particles_to_inject to the end of the vectors - virtual void copyParticles( Particles* particles_to_inject ); + virtual int addParticles( Particles* particles_to_inject ); //! Implementation of a somewhat efficient particle injection, sorting //! 
(including removing leaving particles) and binning for GPU if

diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu
index 617cb0851..a45a56cbb 100644
--- a/src/Particles/nvidiaParticles.cu
+++ b/src/Particles/nvidiaParticles.cu
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "Patch.h"

@@ -308,7 +309,7 @@ namespace detail {
         particle_container.resize( new_count );

         // Combine imported particles to main particles
-        particle_container.copyParticles( &particle_to_inject, initial_count );
+        particle_container.pasteParticles( &particle_to_inject, initial_count );

         // Compute keys of particles
         computeParticleClusterKey( particle_container, parameters, a_parent_patch );
@@ -924,52 +925,58 @@ void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pre
     if( nparts_to_copy ) {
         // Copy the particles to the destination
-        for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
-            const auto in = getPtrDoubleProp( ip );
-            const auto out = dest->getPtrDoubleProp( ip );
-            thrust::copy_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, out, pred );
+        for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) {
+            const auto in = nvidia_double_prop_[ip]->begin();
+            const auto out = dest->nvidia_double_prop_[ip]->begin();
+            thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred );
         }
-        for( int ip = 0; ip < getNShortProp(); ip++ ) {
-            const auto in = getPtrShortProp( ip );
-            const auto out = dest->getPtrShortProp( ip );
-            thrust::copy_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, out, pred );
+        for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) {
+            const auto in = nvidia_short_prop_[ip]->begin();
+            const auto out = dest->nvidia_short_prop_[ip]->begin();
+            thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred );
         }
         if( tracked ) {
-            const auto in = getPtrId();
-            const auto out = dest->getPtrId();
-            thrust::copy_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, out, pred );
+            const auto in = nvidia_id_.begin();
+            const auto out = dest->nvidia_id_.begin();
+            thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred );
         }
-        cudaDeviceSynchronize();
+        const auto in = nvidia_cell_keys_.begin();
+        const auto out = dest->nvidia_cell_keys_.begin();
+        thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred );
+        SMILEI_ACCELERATOR_DEVICE_SYNC();
+        ::hipDeviceSynchronize();
     }
 }

-void nvidiaParticles::copyParticles( Particles* particles_to_inject )
+int nvidiaParticles::addParticles( Particles* particles_to_inject )
 {
     const auto nparts = gpu_nparts_;
     nvidiaParticles* to_inject = static_cast<nvidiaParticles*>( particles_to_inject );
     resize( nparts + to_inject->gpu_nparts_ );
-    copyParticles( to_inject, nparts );
+    pasteParticles( to_inject, nparts );
+    return to_inject->gpu_nparts_;
 }

-void nvidiaParticles::copyParticles( nvidiaParticles* particles_to_inject, size_t offset )
+void nvidiaParticles::pasteParticles( nvidiaParticles* particles_to_inject, size_t offset )
 {
     // Copy the particles to the destination
-    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
-        const auto in = particles_to_inject->getPtrDoubleProp( ip );
-        const auto out = getPtrDoubleProp( ip );
-        thrust::copy_n( thrust::cuda::par_nosync, in, particles_to_inject->gpu_nparts_, out + offset );
+    for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) {
+        const auto in = particles_to_inject->nvidia_double_prop_[ip]->begin();
+        const auto out = nvidia_double_prop_[ip]->begin();
+        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, particles_to_inject->gpu_nparts_, out + offset );
     }
-    for( int ip = 0; ip < getNShortProp(); ip++ ) {
-        const auto in = particles_to_inject->getPtrShortProp( ip );
-        const auto out = getPtrShortProp( ip );
-        thrust::copy_n( thrust::cuda::par_nosync, in, particles_to_inject->gpu_nparts_, out + offset );
+    for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) {
+        const auto in = particles_to_inject->nvidia_short_prop_[ip]->begin();
+        const auto out = nvidia_short_prop_[ip]->begin();
+        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, particles_to_inject->gpu_nparts_, out + offset );
     }
     if( tracked ) {
-        const auto in = particles_to_inject->getPtrId();
-        const auto out = getPtrId();
-        thrust::copy_n( thrust::cuda::par_nosync, in, particles_to_inject->gpu_nparts_, out + offset );
+        const auto in = particles_to_inject->nvidia_id_.begin();
+        const auto out = nvidia_id_.begin();
+        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, particles_to_inject->gpu_nparts_, out + offset );
     }
-    cudaDeviceSynchronize();
+    SMILEI_ACCELERATOR_DEVICE_SYNC();
+    ::hipDeviceSynchronize();
 }

 // -----------------------------------------------------------------------------
@@ -1016,20 +1023,21 @@ int nvidiaParticles::eraseParticlesByPredicate( Predicate pred )
     // Copy the particles to the destination
     // Using more memory, we could use the faster remove_copy_if
     // NOTE: remove_if is stable.
-    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
-        const auto in = getPtrDoubleProp( ip );
-        thrust::remove_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, pred );
+    for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) {
+        const auto in = nvidia_double_prop_[ip]->begin();
+        thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, pred );
     }
-    for( int ip = 0; ip < getNShortProp(); ip++ ) {
-        const auto in = getPtrShortProp( ip );
-        thrust::remove_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, pred );
+    for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) {
+        const auto in = nvidia_short_prop_[ip]->begin();
+        thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, pred );
     }
     if( tracked ) {
-        const auto in = getPtrId();
-        thrust::remove_if( thrust::cuda::par_nosync, in, in + gpu_nparts_, keys, pred );
+        const auto in = nvidia_id_.begin();
+        thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, pred );
     }
-    cudaDeviceSynchronize();
-
+    SMILEI_ACCELERATOR_DEVICE_SYNC();
+    ::hipDeviceSynchronize();
+
     return nparts_to_remove;
 }

@@ -1094,21 +1102,21 @@ void nvidiaParticles::sortParticleByKey()

     // Sort particles using thrust::gather, according to the sorting map
     thrust::device_vector<double> buffer( gpu_nparts_ );
-    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
-        thrust::gather( thrust::device, index.begin(), index.end(), getPtrDoubleProp( ip ), buffer.begin() );
-        swapDoubleProp( ip, buffer );
+    for( auto prop: nvidia_double_prop_ ) {
+        thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer.begin() );
+        prop->swap( buffer );
     }
     buffer.clear();

     thrust::device_vector<short> buffer_short( gpu_nparts_ );
-    for( int ip = 0; ip < getNShortProp(); ip++ ) {
-        thrust::gather( thrust::device, index.begin(), index.end(), getPtrShortProp( ip ), buffer_short.begin() );
-        swapShortProp( ip, buffer_short );
+    for( auto prop: nvidia_short_prop_ ) {
+        thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer_short.begin() );
+        prop->swap( buffer_short );
     }
     buffer_short.clear();

     if( tracked ) {
         thrust::device_vector<uint64_t> buffer_uint64( gpu_nparts_ );
-        thrust::gather( thrust::device, index.begin(), index.end(), getPtrId(), buffer_uint64.begin() );
-        swapId( buffer_uint64 );
+        thrust::gather( thrust::device, index.begin(), index.end(), nvidia_id_.begin(), buffer_uint64.begin() );
+        nvidia_id_.swap( buffer_uint64 );
         buffer_uint64.clear();
     }
 }
@@ -1123,16 +1131,17 @@ void nvidiaParticles::sortParticleByKey( nvidiaParticles& buffer )
     thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() );

     // Sort particles using thrust::gather, according to the sorting map
-    for( int ip = 0; ip < getNDoubleProp(); ip++ ) {
-        thrust::gather( thrust::cuda::par_nosync, index.begin(), index.end(), getPtrDoubleProp( ip ), buffer.getPtrDoubleProp( ip ) );
+    for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) {
+        thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_double_prop_[ip]->begin(), buffer.nvidia_double_prop_[ip]->begin() );
     }
-    for( int ip = 0; ip < getNShortProp(); ip++ ) {
-        thrust::gather( thrust::cuda::par_nosync, index.begin(), index.end(), getPtrShortProp( ip ), buffer.getPtrShortProp( ip ) );
+    for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) {
+        thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_short_prop_[ip]->begin(), buffer.nvidia_short_prop_[ip]->begin() );
     }
     if( tracked ) {
-        thrust::gather( thrust::cuda::par_nosync, index.begin(), index.end(), getPtrId(), buffer.getPtrId() );
+        thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_id_.begin(), buffer.nvidia_id_.begin() );
     }
-    cudaDeviceSynchronize();
+    SMILEI_ACCELERATOR_DEVICE_SYNC();
+    ::hipDeviceSynchronize();

     swap( buffer );
 }
@@ -1202,7 +1211,7 @@ void nvidiaParticles::naiveImportAndSortParticles( nvidiaParticles* particles_to
     // Inject newly arrived particles in particles_to_inject
     const size_t current_size = gpu_nparts_;
     resize( current_size + particles_to_inject->size() );
-    copyParticles( particles_to_inject, current_size );
+    pasteParticles( particles_to_inject, current_size );

     particles_to_inject->clear();
 }

diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h
index 906d3709e..1938da107 100644
--- a/src/Particles/nvidiaParticles.h
+++ b/src/Particles/nvidiaParticles.h
@@ -111,36 +111,12 @@ class nvidiaParticles : public Particles
     uint64_t * getPtrId() override {
         return thrust::raw_pointer_cast( nvidia_id_.data() );
     };
-
-    size_t getNDoubleProp() {
-        return nvidia_double_prop_.size();
-    };
-    size_t getNShortProp() {
-        return nvidia_short_prop_.size();
-    };
-
-    double * getPtrDoubleProp( int iprop ) {
-        return thrust::raw_pointer_cast( nvidia_double_prop_[iprop]->data() );
-    };
-    short * getPtrShortProp( int iprop ) {
-        return thrust::raw_pointer_cast( nvidia_short_prop_[iprop]->data() );
-    };
-
-    void swapDoubleProp( int iprop, thrust::device_vector<double> &new_vector ) {
-        nvidia_double_prop_[iprop]->swap( new_vector );
-    };
-    void swapShortProp( int iprop, thrust::device_vector<short> &new_vector ) {
-        nvidia_short_prop_[iprop]->swap( new_vector );
-    };
-    void swapId( thrust::device_vector<uint64_t> &new_vector ) {
-        nvidia_id_.swap( new_vector );
-    };
-
+
     void swap( nvidiaParticles & p ) {
-        for( int iprop = 0; iprop < getNDoubleProp(); iprop++ ) {
+        for( int iprop = 0; iprop < nvidia_double_prop_.size(); iprop++ ) {
             nvidia_double_prop_[iprop]->swap( *p.nvidia_double_prop_[iprop] );
         }
-        for( int iprop 
= 0; iprop < getNShortProp(); iprop++ ) { + for( int iprop = 0; iprop < nvidia_short_prop_.size(); iprop++ ) { nvidia_short_prop_[iprop]->swap( *p.nvidia_short_prop_[iprop] ); } if( tracked ) { @@ -157,10 +133,10 @@ class nvidiaParticles : public Particles void copyParticlesByPredicate( Particles* buffer, Predicate pred ); //! Resize & Copy particles from particles_to_inject to end of vectors - void copyParticles( Particles* particles_to_inject ) override; + int addParticles( Particles* particles_to_inject ) override; //! Copy particles from particles_to_inject to specific offset - void copyParticles( nvidiaParticles* particles_to_inject, size_t offset ); + void pasteParticles( nvidiaParticles* particles_to_inject, size_t offset ); // ----------------------------------------------------------------------------- //! Erase particles leaving the patch object on device and returns the number of particle removed diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp index 19b39c2ed..31e127876 100755 --- a/src/Species/Species.cpp +++ b/src/Species/Species.cpp @@ -2103,10 +2103,9 @@ void Species::importParticles( Params ¶ms, Patch *patch, Particles &source_p // Warning: the current GPU version does not handle tracked particles // Inject particles from source_particles - particles->copyParticles( &source_particles ); - particles->last_index.back() += source_particles.size(); + particles->last_index.back() += particles->addParticles( &source_particles ); particles->last_index[0] = particles->last_index.back(); - source_particles.clear(); + source_particles.resize( 0 ); #else // --------------------------------------------------- diff --git a/src/Tools/gpu.h b/src/Tools/gpu.h index 28a8c98da..172d9fddf 100644 --- a/src/Tools/gpu.h +++ b/src/Tools/gpu.h @@ -19,10 +19,14 @@ namespace smilei { #define SMILEI_ACCELERATOR_DECLARE_ROUTINE _Pragma( "omp declare target" ) #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END _Pragma( "omp end declare target" ) #define SMILEI_ACCELERATOR_ATOMIC _Pragma( "omp atomic update" ) + #define SMILEI_ACCELERATOR_ASYNC_POLYCY thrust::hip::par_nosync + #define SMILEI_ACCELERATOR_DEVICE_SYNC() hipDeviceSynchronize() #elif defined( SMILEI_OPENACC_MODE ) #define SMILEI_ACCELERATOR_DECLARE_ROUTINE _Pragma( "acc routine seq" ) #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END #define SMILEI_ACCELERATOR_ATOMIC _Pragma( "acc atomic" ) + #define SMILEI_ACCELERATOR_ASYNC_POLYCY thrust::cuda::par_nosync + #define SMILEI_ACCELERATOR_DEVICE_SYNC() cudaDeviceSynchronize() #else #define SMILEI_ACCELERATOR_DECLARE_ROUTINE #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END From 93ba8e1431b358bfe65631f39d2b05e2f326c50b Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Wed, 22 May 2024 15:54:24 +0200 Subject: [PATCH 30/54] typos --- src/Particles/nvidiaParticles.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index a45a56cbb..c8e46afd2 100644 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -944,7 +944,6 @@ void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pre const auto out = dest->nvidia_cell_keys_.begin(); thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred ); SMILEI_ACCELERATOR_DEVICE_SYNC(); - ::hipDeviceSynchronize(); } } @@ -976,7 +975,6 @@ void nvidiaParticles::pasteParticles( nvidiaParticles* particles_to_inject, size thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, particles_to_inject->gpu_nparts_, out + 
offset );
     }
     SMILEI_ACCELERATOR_DEVICE_SYNC();
-    ::hipDeviceSynchronize();
 }

 // -----------------------------------------------------------------------------
@@ -1036,7 +1034,6 @@ int nvidiaParticles::eraseParticlesByPredicate( Predicate pred )
         thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, pred );
     }
     SMILEI_ACCELERATOR_DEVICE_SYNC();
-    ::hipDeviceSynchronize();

     return nparts_to_remove;
 }
@@ -1141,7 +1138,6 @@ void nvidiaParticles::sortParticleByKey( nvidiaParticles& buffer )
         thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_id_.begin(), buffer.nvidia_id_.begin() );
     }
     SMILEI_ACCELERATOR_DEVICE_SYNC();
-    ::hipDeviceSynchronize();

     swap( buffer );
 }

From 9f362f3d3347d7ef58c903eed4dd67d3f4672eb2 Mon Sep 17 00:00:00 2001
From: Frederic Perez 
Date: Fri, 24 May 2024 12:37:17 +0200
Subject: [PATCH 31/54] Slightly faster GPU sort

---
 src/Particles/nvidiaParticles.cu | 109 ++++++++++++++++++-------------
 src/Particles/nvidiaParticles.h  |   6 +-
 2 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu
index c8e46afd2..42995603d 100644
--- a/src/Particles/nvidiaParticles.cu
+++ b/src/Particles/nvidiaParticles.cu
@@ -293,23 +293,34 @@ namespace detail {
                                       const Params&  parameters,
                                       const Patch&   a_parent_patch )
     {
-        // Remove out of bound particles
-        const auto erased_count = particle_container.eraseParticlesByPredicate( cellKeyBelow<0>() );
-
-        const auto initial_count = particle_container.deviceSize() - erased_count;
+        const auto initial_count = particle_container.deviceSize();
         const auto inject_count = particle_to_inject.deviceSize();
-        const auto new_count = initial_count + inject_count;
+
+        // Locate out-of-bounds particles in array "available_places"
+        const auto keys = particle_container.getPtrCellKeys();
+        const auto erased_count = thrust::count_if( thrust::device, keys, keys + initial_count, cellKeyBelow<0>() );
+        thrust::device_vector<int> available_places( erased_count );
+        thrust::copy_if( thrust::device,
+                         thrust::counting_iterator<int>{0},
+                         thrust::counting_iterator<int>{ (int) initial_count },
+                         keys,
+                         available_places.begin(),
+                         cellKeyBelow<0>() );

-        // Resize particles
-        // NOTE: We really want a non-initializing vector here!
-        // It's possible to give a custom allocator to thrust::device_vector.
-        // Create one with construct(<>) as a noop and derive from
-        // thrust::device_malloc_allocator. For now we do an explicit resize.
-        particle_container.softReserve( new_count );
-        particle_container.resize( new_count );
+        const auto new_count = initial_count + inject_count - erased_count;

-        // Combine imported particles to main particles
-        particle_container.pasteParticles( &particle_to_inject, initial_count );
+        // Copy the imported particles to available places
+        particle_to_inject.scatterParticles( particle_container, available_places );
+        // If there are more imported particles than places, copy the remaining imported particles at the end
+        if( inject_count >= erased_count ) {
+            particle_container.resize( new_count );
+            particle_container.pasteParticles( &particle_to_inject, initial_count, erased_count );
+        // If there are more places than imported particles, the remaining places should be filled
+        } else {
+            const auto last_filled = available_places[inject_count];
+            particle_container.eraseParticlesByPredicate( cellKeyBelow<0>(), last_filled );
+            particle_container.resize( new_count );
+        }

         // Compute keys of particles
         computeParticleClusterKey( particle_container, parameters, a_parent_patch );
@@ -319,17 +330,11 @@
         particle_to_inject.resize( new_count );

         // Sort particles using thrust::gather, according to the sorting map
+        // (particle_to_inject serves as a buffer)
         particle_container.sortParticleByKey( particle_to_inject );

         // Recompute bins
         computeBinIndex( particle_container );
-
-        // This free generates a lot of memory fragmentation. If we enable it we
-        // reduce significantly the memory usage over time but a memory spike
-        // will still be present. Unfortunately, this free generates soo much
-        // fragmentation (like the one above) that at some point the GPU memory
-        // allocator will fail!
-        // particle_to_inject.free();
     }

 template <typename Predicate>
 void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pred )
 {
-    // TODO(Etienne M): We are doing extra work. We could use something like
-    // std::partition to output the invalidated particles in buffer
-    // and keep the good ones. This would help us avoid the std::remove_if in
-    // the particle injection and sorting algorithm.
     // Count particles satisfying the predicate
     const auto keys = getPtrCellKeys();
     const int nparts_to_copy = thrust::count_if( thrust::device, keys, keys + gpu_nparts_, pred );
@@ -952,27 +952,29 @@ int nvidiaParticles::addParticles( Particles* particles_to_inject )
     const auto nparts = gpu_nparts_;
     nvidiaParticles* to_inject = static_cast<nvidiaParticles*>( particles_to_inject );
     resize( nparts + to_inject->gpu_nparts_ );
-    pasteParticles( to_inject, nparts );
+    pasteParticles( to_inject, nparts, 0 );
     return to_inject->gpu_nparts_;
 }

-void nvidiaParticles::pasteParticles( nvidiaParticles* particles_to_inject, size_t offset )
+void nvidiaParticles::pasteParticles( nvidiaParticles* particles_to_inject, size_t offset_in_output, size_t offset_in_input )
 {
+    const auto n = particles_to_inject->gpu_nparts_ - (int) offset_in_input;
+
     // Copy the particles to the destination
     for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) {
-        const auto in = particles_to_inject->nvidia_double_prop_[ip]->begin();
-        const auto out = nvidia_double_prop_[ip]->begin();
-        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, particles_to_inject->gpu_nparts_, out + offset );
+        const auto in = particles_to_inject->nvidia_double_prop_[ip]->begin() + offset_in_input;
+        const auto out = nvidia_double_prop_[ip]->begin() + offset_in_output;
+        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, n, out );
     }
     for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) {
-        const auto in = particles_to_inject->nvidia_short_prop_[ip]->begin();
-        const auto out = nvidia_short_prop_[ip]->begin();
-        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, particles_to_inject->gpu_nparts_, out + offset );
+        const auto in = particles_to_inject->nvidia_short_prop_[ip]->begin() + offset_in_input;
+        const auto out = nvidia_short_prop_[ip]->begin() + offset_in_output;
+        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, n, out );
     }
     if( tracked ) {
-        const auto in = particles_to_inject->nvidia_id_.begin();
-        const auto out = nvidia_id_.begin();
-        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, particles_to_inject->gpu_nparts_, out + offset );
+        const auto in = particles_to_inject->nvidia_id_.begin() + offset_in_input;
+        const auto out = nvidia_id_.begin() + offset_in_output;
+        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, n, out );
     }
     SMILEI_ACCELERATOR_DEVICE_SYNC();
 }

@@ -1006,32 +1008,32 @@ void nvidiaParticles::pasteParticles( nvidiaParticles* particles_to_inject, size
 // -----------------------------------------------------------------------------
 int nvidiaParticles::eraseLeavingParticles()
 {
-    const auto nremoved = eraseParticlesByPredicate( cellKeyBelow<0>() );
+    const auto nremoved = eraseParticlesByPredicate( cellKeyBelow<0>(), 0 );
     resize( gpu_nparts_ - nremoved );
     return nremoved;
 }

 //! "Erase" particles but does not resize the arrays!
 template <typename Predicate>
-int nvidiaParticles::eraseParticlesByPredicate( Predicate pred )
+int nvidiaParticles::eraseParticlesByPredicate( Predicate pred, size_t offset )
 {
     const auto keys = getPtrCellKeys();
-    const int nparts_to_remove = thrust::count_if( thrust::device, keys, keys + gpu_nparts_, pred );
+    const int nparts_to_remove = thrust::count_if( thrust::device, keys + offset, keys + gpu_nparts_, pred );

     // Copy the particles to the destination
     // Using more memory, we could use the faster remove_copy_if
     // NOTE: remove_if is stable.
for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { const auto in = nvidia_double_prop_[ip]->begin(); - thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, pred ); + thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { const auto in = nvidia_short_prop_[ip]->begin(); - thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, pred ); + thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } if( tracked ) { const auto in = nvidia_id_.begin(); - thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, pred ); + thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } SMILEI_ACCELERATOR_DEVICE_SYNC(); @@ -1142,6 +1144,25 @@ void nvidiaParticles::sortParticleByKey( nvidiaParticles& buffer ) swap( buffer ); } + +void nvidiaParticles::scatterParticles( nvidiaParticles &dest, const thrust::device_vector &index ) +{ + const auto n = std::min( (int) index.size(), gpu_nparts_ ); + for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { + const auto in = nvidia_double_prop_[ip]->begin(); + thrust::scatter( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + n, index.begin(), dest.nvidia_double_prop_[ip]->begin() ); + } + for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { + const auto in = nvidia_short_prop_[ip]->begin(); + thrust::scatter( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + n, index.begin(), dest.nvidia_short_prop_[ip]->begin() ); + } + if( tracked ) { + const auto in = nvidia_id_.begin(); + thrust::scatter( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + n, index.begin(), dest.nvidia_id_.begin() ); + } + SMILEI_ACCELERATOR_DEVICE_SYNC(); +} + int nvidiaParticles::prepareBinIndex() { if( first_index.size() == 0 ) { @@ -1207,7 +1228,7 @@ void nvidiaParticles::naiveImportAndSortParticles( nvidiaParticles* particles_to // Inject newly arrived particles in particles_to_inject const size_t current_size = gpu_nparts_; resize( current_size + particles_to_inject->size() ); - pasteParticles( particles_to_inject, current_size ); + pasteParticles( particles_to_inject, current_size, 0 ); particles_to_inject->clear(); } diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index 1938da107..37b3fc18d 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -136,7 +136,7 @@ class nvidiaParticles : public Particles int addParticles( Particles* particles_to_inject ) override; //! Copy particles from particles_to_inject to specific offset - void pasteParticles( nvidiaParticles* particles_to_inject, size_t offset ); + void pasteParticles( nvidiaParticles* particles_to_inject, size_t offset_out, size_t offset_in ); // ----------------------------------------------------------------------------- //! Erase particles leaving the patch object on device and returns the number of particle removed @@ -144,7 +144,7 @@ class nvidiaParticles : public Particles int eraseLeavingParticles() override; template - int eraseParticlesByPredicate( Predicate pred ); + int eraseParticlesByPredicate( Predicate pred, size_t offset ); // --------------------------------------------------------------------------------------------------------------------- //! Create n_additional_particles new particles at the end of vectors @@ -161,6 +161,8 @@ class nvidiaParticles : public Particles //! 
This version is asynchronous, but requires a buffer of equal size to be provided void sortParticleByKey( nvidiaParticles& buffer ); + void scatterParticles( nvidiaParticles &particles_to_import, const thrust::device_vector &index ); + protected: //! Redefine first_index and last_index according to the binning algorithm //! used on GPU. From 3074a9fab7ecbd37ce74b6207f544daf0e6c2d9f Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Fri, 24 May 2024 14:35:24 +0200 Subject: [PATCH 32/54] add publication --- doc/Sphinx/Overview/material.rst | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 33184146f..66ed26180 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). -As of May 2024, 188 papers have been published covering a broad range of topics: +As of May 2024, 189 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,6 +50,12 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. +.. [Krafft2024b] + + C. Krafft, P. Savoini, and F. J. Polanco-Rodríguez, + `Mechanisms of Fundamental Electromagnetic Wave Radiation in the Solar Wind`, + `The Astrophysical Journal Letters 967, 2 (2024) `_ + .. [Salgado2024] F. C. Salgado, A. Kozan, D. Seipt, D. Hollatz, P. Hilz, M. Kaluza, A. Sävert, A. Seidel, D. Ullmann, Y. Zhao, and M. Zepf, @@ -92,7 +98,7 @@ Following is the distribution of these topics in the listed publications up to N `Control of autoresonant plasma beat-wave wakefield excitation`, `Physical Review Research 6, 013338 (2024) `_ -.. [Krafft2024] +.. [Krafft2024a] C. Krafft and P. Savoini, `Electrostatic Wave Decay in the Randomly Inhomogeneous Solar Wind`, @@ -532,7 +538,7 @@ Following is the distribution of these topics in the listed publications up to N .. [Pae2022] - K. . Pae, C. M. Kim, V. B. Pathak, C.-M. Ryu and C. H. Nam, + K. H. Pae, C. M. Kim, V. B. Pathak, C.-M. Ryu and C. H. Nam, `Direct laser acceleration of electrons from a plasma mirror by an intense few-cycle Laguerre–Gaussian laser and its dependence on the carrier-envelope phase`, `Plasma Physics and Controlled Fusion 64, 055013 (2022) `_ @@ -769,7 +775,7 @@ Following is the distribution of these topics in the listed publications up to N .. [Psikal2021] - J Psikal, + J. Psikal, `Laser-driven ion acceleration from near-critical Gaussian plasma density profile`, `Plasma Physics and Controlled Fusion 63, 064002 (2021) `_ @@ -787,7 +793,7 @@ Following is the distribution of these topics in the listed publications up to N .. [Golovanov2021] - A A Golovanov, I Yu Kostyukov, L Reichwein, J Thomas and A Pukhov, + A. A. Golovanov, I. Y. Kostyukov, L. Reichwein, J. Thomas and A. 
Pukhov, `Excitation of strongly nonlinear plasma wakefield by electron bunches`, `Plasma Physics and Controlled Fusion 63, 085004 (2021) `_ From 1045fd21bda84117a21c6118f01dd23d36e0ce73 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Mon, 27 May 2024 09:38:48 +0200 Subject: [PATCH 33/54] Various small fixes --- doc/Sphinx/Use/namelist.rst | 8 ++++++-- happi/_Diagnostics/TrackParticles.py | 15 ++++++++++----- happi/_Utils.py | 6 +++++- src/SmileiMPI/AsyncMPIbuffers.h | 2 +- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/doc/Sphinx/Use/namelist.rst b/doc/Sphinx/Use/namelist.rst index f7deebcae..6c5eaf2be 100755 --- a/doc/Sphinx/Use/namelist.rst +++ b/doc/Sphinx/Use/namelist.rst @@ -1148,6 +1148,9 @@ Each species has to be defined in a ``Species`` block:: :ref:`tracking `. The available fields are ``"Ex"``, ``"Ey"``, ``"Ez"``, ``"Bx"``, ``"By"`` and ``"Bz"``. + Note that magnetic field components, as they originate from the interpolator, + are shifted by half a timestep compared to those from the *Fields* diagnostics. + Additionally, the work done by each component of the electric field is available as ``"Wx"``, ``"Wy"`` and ``"Wz"``. Contrary to the other interpolated fields, these quantities are accumulated over time. @@ -2716,7 +2719,8 @@ or several points arranged in a 2-D or 3-D grid. * **In "AMcylindrical" geometry**, probes are defined with 3D Cartesian coordinates and cannot be separated per mode. Use Field diagnostics for cylindrical coordinates and information per mode. - + * **Probes rely on the particle interpolator to compute fields** so that the + magnetic field is shifted by half a timestep compared to that of *Fields* diagnostics. To add one probe diagnostic, include the block ``DiagProbe``:: @@ -3343,7 +3347,7 @@ for instance:: def my_filter(particles): return (particles.px>-1.)*(particles.px<1.) + (particles.pz>3.) -.. Warning:: The ``px``, ``py`` and ``pz`` quantities are not exactly the momenta. +.. Note:: The ``px``, ``py`` and ``pz`` quantities are not exactly the momenta. They are actually the velocities multiplied by the lorentz factor, i.e., :math:`\gamma v_x`, :math:`\gamma v_y` and :math:`\gamma v_z`. This is true only inside the ``filter`` function (not for the output of the diagnostic). 
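As an illustrative sketch only (assuming the usual Smilei normalization stated above, where ``px``, ``py`` and ``pz`` are :math:`\gamma v` in units of :math:`c` inside the ``filter``, so that :math:`\gamma = \sqrt{1 + p_x^2 + p_y^2 + p_z^2}` for a massive particle), a filter selecting particles by Lorentz factor could look like::

    def my_filter(particles):
        # px, py, pz are gamma*v in units of c here, so the Lorentz factor
        # of a massive particle follows directly from them
        gamma = (1. + particles.px**2 + particles.py**2 + particles.pz**2)**0.5
        # keep only particles above a hypothetical, arbitrary threshold
        return gamma > 5.
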
diff --git a/happi/_Diagnostics/TrackParticles.py b/happi/_Diagnostics/TrackParticles.py index 253bb2958..0825eb0f3 100755 --- a/happi/_Diagnostics/TrackParticles.py +++ b/happi/_Diagnostics/TrackParticles.py @@ -447,8 +447,9 @@ def _orderFiles( self, fileOrdered, chunksize, sort ): for k, name in self._short_properties_from_raw.items(): if k not in group: continue ordered = self._np.empty((nparticles_to_write, ), dtype=group[k].dtype) - if k == "id": ordered.fill(0) - else : ordered.fill(self._np.nan) + if k == "id" : ordered.fill(0) + elif k == "charge": ordered.fill(9999) + else : ordered.fill(self._np.nan) ordered[locs] = group[k][()][selectedIndices] f0[name].write_direct(ordered, dest_sel=self._np.s_[it,:]) @@ -461,8 +462,9 @@ def _orderFiles( self, fileOrdered, chunksize, sort ): for first_o, last_o, npart_o in ChunkedRange(nparticles_to_write, chunksize): for k, name in self._short_properties_from_raw.items(): if k not in group: continue - if k == "id": data[k].fill(0) - else : data[k].fill(self._np.nan) + if k == "id" : data[k].fill(0) + elif k == "charge": data[k].fill(9999) + else : data[k].fill(self._np.nan) # Loop chunks of the input for first_i, last_i, npart_i in ChunkedRange(nparticles, chunksize): # Obtain IDs @@ -538,7 +540,10 @@ def _generateRawData(self, times=None): data[it,:] -= self._XmovedForTime[time] else: data = self._readUnstructuredH5(self._h5items[axis], self.selectedParticles, first_time, last_time) - data[deadParticles] = self._np.nan + if data.dtype == float: + data[deadParticles] = self._np.nan + else: + data[deadParticles] = 9999 self._rawData[axis] = data if self._verbose: print("Process broken lines ...") diff --git a/happi/_Utils.py b/happi/_Utils.py index 9fd35a757..28dd028df 100755 --- a/happi/_Utils.py +++ b/happi/_Utils.py @@ -398,7 +398,11 @@ def __init__(self, operation, QuantityTranslator, ureg): raise Exception("Quantity "+q+" not understood") # Calculate the total units and its inverse locals().update(self.imports) - units = eval("".join(basic_op)).units + units = eval("".join(basic_op)) + if isinstance(units, (int, float)): + units = ureg.Quantity(1) # dimensionless + else: + units = units.units self.translated_units = units.format_babel(locale="en") # Make the operation string self.translated_operation = "".join(full_op) diff --git a/src/SmileiMPI/AsyncMPIbuffers.h b/src/SmileiMPI/AsyncMPIbuffers.h index 7b3cf1fcc..90ba02fb1 100755 --- a/src/SmileiMPI/AsyncMPIbuffers.h +++ b/src/SmileiMPI/AsyncMPIbuffers.h @@ -17,7 +17,7 @@ class AsyncMPIbuffers AsyncMPIbuffers(); ~AsyncMPIbuffers(); - virtual void allocate( unsigned int nDim_field ); + void allocate( unsigned int nDim_field ); void defineTags( Patch *patch, SmileiMPI *smpi, int tag ) ; From a48d556b6dcffb9042342fa53efec27c9f53f33b Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Mon, 27 May 2024 12:44:49 +0200 Subject: [PATCH 34/54] fix many warnings --- makefile | 8 +- scripts/compile_tools/machine/adastra | 1 - scripts/compile_tools/machine/ruche_gpu2 | 2 +- src/Checkpoint/Checkpoint.cpp | 2 +- src/Diagnostic/DiagnosticProbes.cpp | 2 +- src/Diagnostic/DiagnosticScalar.cpp | 22 +-- src/Diagnostic/DiagnosticTrack.cpp | 2 +- src/ElectroMagn/ElectroMagn.cpp | 14 +- src/ElectroMagn/ElectroMagn.h | 2 +- src/ElectroMagn/ElectroMagn1D.cpp | 2 +- src/ElectroMagn/ElectroMagn1D.h | 2 +- src/ElectroMagn/ElectroMagn2D.cpp | 32 ++-- src/ElectroMagn/ElectroMagn2D.h | 2 +- src/ElectroMagn/ElectroMagn3D.cpp | 32 ++-- src/ElectroMagn/ElectroMagn3D.h | 2 +- src/ElectroMagn/ElectroMagnAM.cpp | 6 +- 
src/ElectroMagn/ElectroMagnAM.h | 2 +- src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp | 16 +- src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp | 26 +-- .../MA_Solver1D_Friedman.cpp | 2 +- src/ElectroMagnSolver/MA_Solver2D_norm.cpp | 18 +- src/ElectroMagnSolver/MA_Solver3D_norm.cpp | 18 +- src/ElectroMagnSolver/MF_Solver2D_Yee.cpp | 18 +- src/ElectroMagnSolver/MF_Solver3D_Yee.cpp | 18 +- .../PML_Solver2D_Envelope.cpp | 8 +- .../PML_SolverAM_Envelope.cpp | 9 +- ...PML_SolverAM_EnvelopeReducedDispersion.cpp | 7 +- src/Field/Field.cpp | 6 +- src/Field/Field.h | 8 +- src/Field/Field1D.cpp | 2 +- src/Field/Field1D.h | 2 +- src/Field/Field2D.cpp | 26 +-- src/Field/Field2D.h | 2 +- src/Field/Field3D.cpp | 38 ++-- src/Field/Field3D.h | 2 +- src/Field/cField.h | 2 +- src/Field/cField1D.cpp | 2 +- src/Field/cField1D.h | 2 +- src/Field/cField2D.cpp | 2 +- src/Field/cField2D.h | 2 +- src/Field/cField3D.cpp | 2 +- src/Field/cField3D.h | 2 +- src/Interpolator/Interpolator2D2Order.cpp | 10 +- src/Interpolator/Interpolator3D2Order.cpp | 12 +- src/Interpolator/Interpolator3D2Order.h | 2 +- src/MovWindow/SimWindow.cpp | 4 +- .../MultiphotonBreitWheeler.cpp | 26 +-- .../MultiphotonBreitWheeler.h | 2 +- .../MultiphotonBreitWheelerTables.h | 4 +- src/Params/Params.cpp | 34 ++-- src/Params/Params.h | 2 +- src/ParticleBC/BoundaryConditionType.cpp | 24 +-- src/ParticleBC/PartBoundCond.h | 2 +- src/Particles/Particles.cpp | 6 +- src/Particles/ParticlesFactory.cpp | 4 +- src/Patch/Patch.cpp | 4 +- src/Patch/Patch.h | 2 +- src/Patch/SyncVectorPatch.cpp | 60 +++---- src/Patch/SyncVectorPatch.h | 30 ++-- src/Patch/VectorPatch.cpp | 64 ++++--- src/Patch/VectorPatch.h | 2 +- src/Projector/Projector2D2OrderGPU.cpp | 62 ++++--- src/Projector/Projector2D2OrderGPU.h | 30 ++-- src/Projector/Projector2D2OrderGPUKernel.cpp | 2 +- .../Projector2D2OrderGPUKernelCUDAHIP.cu | 4 +- .../Projector2D2OrderGPUKernelCUDAHIP.h | 2 +- src/Projector/Projector3D2OrderGPU.cpp | 169 ++++++++++-------- src/Projector/Projector3D2OrderGPU.cpp.backup | 32 ++-- src/Projector/Projector3D2OrderGPU.h | 30 ++-- src/Projector/Projector3D2OrderGPUKernel.cpp | 2 +- src/Projector/Projector3D2OrderGPUKernelAcc.h | 26 +-- .../Projector3D2OrderGPUKernelCUDAHIP.cu | 2 +- .../Projector3D2OrderGPUKernelCUDAHIP.h | 2 +- .../Projector3D2OrderGPUKernelNaive.h | 6 +- src/Projector/ProjectorAM2OrderV.cpp | 4 - src/Projector/ProjectorFactory.h | 4 +- src/Pusher/PusherBoris.cpp | 2 +- src/Pusher/PusherBorisNR.cpp | 2 +- src/Pusher/PusherHigueraCary.cpp | 2 +- src/Pusher/PusherPhoton.cpp | 2 +- src/Pusher/PusherPonderomotiveBoris.cpp | 2 +- src/Pusher/PusherPonderomotiveBorisBTIS3.cpp | 1 - .../PusherPonderomotivePositionBoris.cpp | 2 +- src/Pusher/PusherVay.cpp | 2 +- src/Radiation/RadiationCorrLandauLifshitz.cpp | 12 +- src/Radiation/RadiationLandauLifshitz.cpp | 12 +- src/Radiation/RadiationMonteCarlo.cpp | 30 ++-- src/Radiation/RadiationMonteCarlo.h | 2 +- src/Radiation/RadiationNiel.cpp | 32 ++-- src/Radiation/RadiationNiel.h | 2 +- src/Radiation/RadiationTables.h | 14 +- src/Radiation/RadiationTools.h | 14 +- src/Radiation/Table.h | 2 +- src/Smilei.cpp | 16 +- src/SmileiMPI/SmileiMPI.cpp | 14 +- src/SmileiMPI/SmileiMPI.h | 6 +- src/Species/Species.cpp | 63 ++++--- src/Species/Species.h | 6 +- src/Tools/Pragma.h | 2 +- src/Tools/gpu.cpp | 26 +-- src/Tools/gpu.h | 2 +- src/Tools/gpuRandom.h | 22 ++- src/Tools/userFunctions.h | 4 +- 103 files changed, 689 insertions(+), 653 deletions(-) diff --git a/makefile b/makefile index d06dfaccc..277a2237d 100755 --- a/makefile +++ 
b/makefile @@ -202,9 +202,9 @@ endif ifneq (,$(call parse_config,gpu_nvidia)) override config += noopenmp # Prevent openmp for nvidia - CXXFLAGS += -DSMILEI_ACCELERATOR_MODE -DSMILEI_OPENACC_MODE + CXXFLAGS += -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OACC GPU_COMPILER ?= nvcc - GPU_COMPILER_FLAGS += -x cu -DSMILEI_ACCELERATOR_MODE -DSMILEI_OPENACC_MODE $(DIRS:%=-I%) + GPU_COMPILER_FLAGS += -x cu -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OACC $(DIRS:%=-I%) GPU_COMPILER_FLAGS += -I$(BUILD_DIR)/src/Python $(PY_CXXFLAGS) GPU_KERNEL_SRCS := $(shell find src/* -name \*.cu) GPU_KERNEL_OBJS := $(addprefix $(BUILD_DIR)/, $(GPU_KERNEL_SRCS:.cu=.o)) @@ -214,9 +214,9 @@ endif # AMD GPUs ifneq (,$(call parse_config,gpu_amd)) - CXXFLAGS += -DSMILEI_ACCELERATOR_MODE + CXXFLAGS += -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OMP GPU_COMPILER ?= $(CC) - GPU_COMPILER_FLAGS += -x hip -DSMILEI_ACCELERATOR_MODE -std=c++14 $(DIRS:%=-I%) + GPU_COMPILER_FLAGS += -x hip -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OMP -std=c++14 $(DIRS:%=-I%) GPU_COMPILER_FLAGS += -I$(BUILD_DIR)/src/Python $(PY_CXXFLAGS) GPU_KERNEL_SRCS := $(shell find src/* -name \*.cu) GPU_KERNEL_OBJS := $(addprefix $(BUILD_DIR)/, $(GPU_KERNEL_SRCS:.cu=.o)) diff --git a/scripts/compile_tools/machine/adastra b/scripts/compile_tools/machine/adastra index 7aab184ce..14c2a975a 100644 --- a/scripts/compile_tools/machine/adastra +++ b/scripts/compile_tools/machine/adastra @@ -85,7 +85,6 @@ ADASTRA_DEBUG_FLAGS := -g -ggdb $(ADASTRA_DEBUG_SANITIZER_FLAGS) -v # ifneq (,$(call parse_config,gpu_amd)) # When using OMP - ADASTRA_ACCELERATOR_GPU_OMP_DEFINE_FLAGS := -DSMILEI_ACCELERATOR_GPU_OMP=1 # ADASTRA_ACCELERATOR_GPU_TARGET := gfx908 # ADASTRA_ACCELERATOR_GPU_TARGET := gfx908:xnack- diff --git a/scripts/compile_tools/machine/ruche_gpu2 b/scripts/compile_tools/machine/ruche_gpu2 index a9406d60d..80cf09198 100644 --- a/scripts/compile_tools/machine/ruche_gpu2 +++ b/scripts/compile_tools/machine/ruche_gpu2 @@ -26,7 +26,7 @@ GPU_COMPILER_FLAGS += -arch=sm_80 #sm_89 # first compile completely with sm_80 t CXXFLAGS += -Minfo=accel # what is offloaded/copied # CXXFLAGS += -Minfo=all # very verbose output -# To turn on the OpenMP support, uncomment these 3 lines and comment the line just above defining 'SMILEI_OPENACC_MODE' +# To turn on the OpenMP support, uncomment these 3 lines and comment the line just above defining 'SMILEI_ACCELERATOR_GPU_OACC' # CXXFLAGS += -mp=gpu -DSMILEI_ACCELERATOR_GPU_OMP # GPU_COMPILER_FLAGS += -DSMILEI_ACCELERATOR_GPU_OMP # Can't we pass the -mp=gpu to nvcc when compiling a .cu file ? # LDFLAGS += -mp=gpu diff --git a/src/Checkpoint/Checkpoint.cpp b/src/Checkpoint/Checkpoint.cpp index 13c3d28a5..943840cb9 100755 --- a/src/Checkpoint/Checkpoint.cpp +++ b/src/Checkpoint/Checkpoint.cpp @@ -233,7 +233,7 @@ void Checkpoint::dumpAll( VectorPatch &vecPatches, Region ®ion, unsigned int MESSAGE( " Checkpoint #" << num_dump << " at iteration " << itime << " dumped" ); #endif -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) MESSAGE( " Copying device data in main memory" ); // TODO(Etienne M): This may very well be redundant if we did a diagnostic // during the last iteration. 
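The renames above leave a two-level macro scheme: SMILEI_ACCELERATOR_GPU is set for any GPU build, while SMILEI_ACCELERATOR_GPU_OACC (OpenACC, NVIDIA path) and SMILEI_ACCELERATOR_GPU_OMP (OpenMP target, AMD path) select the offload backend. A minimal sketch of how a source file can branch on that scheme; the loop body is a placeholder, and the guard layout follows the hunks above:

    // One umbrella macro for "any GPU", one macro per offload backend.
    void axpy( double a, const double *x, double *y, int n )
    {
    #if defined( SMILEI_ACCELERATOR_GPU_OMP )
        #pragma omp target teams distribute parallel for is_device_ptr( x, y )
    #elif defined( SMILEI_ACCELERATOR_GPU_OACC )
        #pragma acc parallel deviceptr( x, y )
        #pragma acc loop gang worker vector
    #endif
        for( int i = 0; i < n; i++ ) {
            y[i] += a * x[i];   // serial on CPU builds, offloaded on GPU builds
        }
    }
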
Indeed, we copy everything from the device to diff --git a/src/Diagnostic/DiagnosticProbes.cpp b/src/Diagnostic/DiagnosticProbes.cpp index 5e79eecc9..e66c684e7 100755 --- a/src/Diagnostic/DiagnosticProbes.cpp +++ b/src/Diagnostic/DiagnosticProbes.cpp @@ -740,7 +740,7 @@ void DiagnosticProbes::run( SmileiMPI *smpi, VectorPatch &vecPatches, int itime, // Interpolate all usual fields on probe ("fake") particles of current patch unsigned int iPart_MPI = offset_in_MPI[ipatch]; unsigned int maxPart_MPI = offset_in_MPI[ipatch] + npart; -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) smpi->resizeDeviceBuffers( ithread, nDim_particle, npart ); diff --git a/src/Diagnostic/DiagnosticScalar.cpp b/src/Diagnostic/DiagnosticScalar.cpp index fe88f47d9..9b8b17409 100755 --- a/src/Diagnostic/DiagnosticScalar.cpp +++ b/src/Diagnostic/DiagnosticScalar.cpp @@ -436,7 +436,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) const unsigned int nPart=vecSpecies[ispec]->getNbrOfParticles(); // number of particles -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) const double *const __restrict__ weight_ptr = vecSpecies[ispec]->particles->getPtrWeight(); const short *const __restrict__ charge_ptr = vecSpecies[ispec]->particles->getPtrCharge(); const double *const __restrict__ momentum_x = vecSpecies[ispec]->particles->getPtrMomentum(0); @@ -447,14 +447,14 @@ void DiagnosticScalar::compute( Patch *patch, int ) if( vecSpecies[ispec]->mass_ > 0 ) { // GPU mode -#ifdef SMILEI_ACCELERATOR_MODE +#ifdef SMILEI_ACCELERATOR_GPU #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target teams distribute parallel for \ map(tofrom: density) \ is_device_ptr(weight_ptr) \ reduction(+:density) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(weight_ptr) #pragma acc loop gang worker vector reduction(+:density) #endif @@ -468,7 +468,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) map(tofrom: charge) \ is_device_ptr( charge_ptr, weight_ptr) \ reduction(+:charge) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(weight_ptr, charge_ptr) #pragma acc loop gang worker vector reduction(+:charge) #endif @@ -484,7 +484,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) momentum_y /* [istart:particle_number] */, \ momentum_z /* [istart:particle_number] */) \ reduction(+:ener_tot) -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc parallel deviceptr(weight_ptr, \ momentum_x, \ momentum_y, \ @@ -525,14 +525,14 @@ void DiagnosticScalar::compute( Patch *patch, int ) } else if( vecSpecies[ispec]->mass_ == 0 ) { // GPU mode -#ifdef SMILEI_ACCELERATOR_MODE +#ifdef SMILEI_ACCELERATOR_GPU #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target teams distribute parallel for \ map(tofrom: density) \ is_device_ptr(weight_ptr) \ reduction(+:density) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(weight_ptr) #pragma acc loop gang worker vector reduction(+:density) #endif @@ -548,7 +548,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) momentum_y /* [istart:particle_number] */, \ momentum_z /* [istart:particle_number] */) \ reduction(+:ener_tot) -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc parallel deviceptr(weight_ptr, \ momentum_x, \ momentum_y, \ @@ -667,7 +667,7 @@ void 
DiagnosticScalar::compute( Patch *patch, int ) // total energy in current field double Uem = 0.; if( ! AM ) { -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) Uem = field->norm2OnDevice( EMfields->istart, EMfields->bufsize ); #else Uem = field->norm2( EMfields->istart, EMfields->bufsize ); @@ -751,7 +751,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) j_max = iFieldStart[1]; k_max = iFieldStart[2]; -#if defined( SMILEI_ACCELERATOR_MODE) +#if defined( SMILEI_ACCELERATOR_GPU) // We use scalar rather than arrays because omp target // sometime fails to pass them to the device const unsigned int ixstart = iFieldStart[0]; @@ -776,7 +776,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) map(tofrom: minval, maxval, i_min, i_max, j_min, j_max, k_min, k_max) \ map(to: ny, nz, ixstart, ixend, iystart, iyend, izstart, izend) //reduction(min:minval) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field_data) //deviceptr( data_ ) #pragma acc loop gang worker vector collapse(3) #endif diff --git a/src/Diagnostic/DiagnosticTrack.cpp b/src/Diagnostic/DiagnosticTrack.cpp index 16ac325e9..583caab94 100755 --- a/src/Diagnostic/DiagnosticTrack.cpp +++ b/src/Diagnostic/DiagnosticTrack.cpp @@ -188,7 +188,7 @@ void DiagnosticTrack::setIDs( Patch *patch ) for( unsigned int iPart=0; iPartvecSpecies[species_index_]->particles->id( iPart ) = ++latest_Id; } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) patch->vecSpecies[species_index_]->particles->initializeIDsOnDevice(); #endif } diff --git a/src/ElectroMagn/ElectroMagn.cpp b/src/ElectroMagn/ElectroMagn.cpp index 2c75bc6a4..02467ecd4 100755 --- a/src/ElectroMagn/ElectroMagn.cpp +++ b/src/ElectroMagn/ElectroMagn.cpp @@ -555,7 +555,7 @@ void ElectroMagn::applyAntenna( unsigned int iAntenna, double intensity ) //! Compute the total density and currents from species density and currents on Device //! This function is valid wathever the geometry // --------------------------------------------------------------------------------------------------------------------- -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) void ElectroMagn::computeTotalRhoJOnDevice() { @@ -577,7 +577,7 @@ void ElectroMagn::computeTotalRhoJOnDevice() double *const __restrict__ rhosp = rho_s[ispec] ? 
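Every scalar reduction above follows the same two-backend shape: the OpenMP path maps the accumulator tofrom and declares reduction(+:...) on a combined target construct, while the OpenACC path marks the particle arrays deviceptr and attaches the reduction to the loop. A condensed sketch of the charge sum, with pointer names taken from the hunks and nPart assumed in scope:

    double charge = 0.;
    #if defined( SMILEI_ACCELERATOR_GPU_OMP )
        #pragma omp target teams distribute parallel for \
            map( tofrom : charge )                       \
            is_device_ptr( charge_ptr, weight_ptr )      \
            reduction( + : charge )
    #elif defined( SMILEI_ACCELERATOR_GPU_OACC )
        #pragma acc parallel deviceptr( weight_ptr, charge_ptr )
        #pragma acc loop gang worker vector reduction( + : charge )
    #endif
    for( unsigned int ipart = 0; ipart < nPart; ipart++ ) {
        charge += weight_ptr[ipart] * ( double )charge_ptr[ipart];
    }
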
rho_s[ispec]->data() : nullptr; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( \ Jxp[0:Jx_size], \ Jyp[0:Jy_size], \ @@ -594,7 +594,7 @@ void ElectroMagn::computeTotalRhoJOnDevice() #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc loop gang worker vector #endif for( unsigned int i=0 ; idata(); // Magnetic field Bx^(p,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofBx = Bx_->size(); const int sizeofBy = By_->size(); const int sizeofBz = Bz_->size(); @@ -1229,10 +1229,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_p; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -1241,7 +1241,7 @@ void ElectroMagn2D::centerMagneticFields() } // Magnetic field By^(d,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(By2D[0:sizeofBy],By2D_m[0:sizeofBy]) #pragma acc loop gang worker #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -1249,10 +1249,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p + 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { @@ -1260,7 +1260,7 @@ void ElectroMagn2D::centerMagneticFields() } } // Magnetic field Bz^(d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(Bz2D[0:sizeofBz],Bz2D_m[0:sizeofBz]) #pragma acc loop gang worker #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -1268,10 +1268,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p + 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -1282,7 +1282,7 @@ void ElectroMagn2D::centerMagneticFields() double *const By2D_oldBTIS3 = By_mBTIS3->data(); double *const Bz2D_oldBTIS3 = Bz_mBTIS3->data(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofByBTIS3 = By_mBTIS3->size(); #pragma acc parallel present(By2D_oldBTIS3[0:sizeofByBTIS3],By2D[0:sizeofBy]) #pragma acc loop gang @@ -1291,17 +1291,17 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p - 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { By2D_oldBTIS3[x * ny_p + y] = ( By2D[(x+1) * ny_p + y] + By2D_oldBTIS3[x * ny_p + y] ) * 0.5; } } -#if defined( SMILEI_OPENACC_MODE ) 
+#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofBzBTIS3 = Bz_mBTIS3->size(); #pragma acc parallel present(Bz2D_oldBTIS3[0:sizeofBz],Bz2D[0:sizeofBz]) #pragma acc loop gang @@ -1310,10 +1310,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p - 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -1392,7 +1392,7 @@ void ElectroMagn2D::computeTotalRhoJ() //END computeTotalRhoJ } -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // //! Method used to compute the total charge density and currents by summing over all species on Device // void ElectroMagn2D::computeTotalRhoJOnDevice() // { diff --git a/src/ElectroMagn/ElectroMagn2D.h b/src/ElectroMagn/ElectroMagn2D.h index aecb87ab8..d8cdfb031 100755 --- a/src/ElectroMagn/ElectroMagn2D.h +++ b/src/ElectroMagn/ElectroMagn2D.h @@ -115,7 +115,7 @@ class ElectroMagn2D : public ElectroMagn //! Method used to compute the total charge density and currents by summing over all species void computeTotalRhoJ() override; -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // //! Method used to compute the total charge density and currents by summing over all species on Device // void computeTotalRhoJOnDevice() override; // #endif diff --git a/src/ElectroMagn/ElectroMagn3D.cpp b/src/ElectroMagn/ElectroMagn3D.cpp index c8994d75c..41ba9cc58 100755 --- a/src/ElectroMagn/ElectroMagn3D.cpp +++ b/src/ElectroMagn/ElectroMagn3D.cpp @@ -4,7 +4,7 @@ #include #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif @@ -1207,7 +1207,7 @@ void ElectroMagn3D::centerMagneticFields() double *const __restrict__ Bz3D_m = Bz_m->data(); // Magnetic field Bx^(p,d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofBx = Bx_->size(); const int sizeofBy = By_->size(); const int sizeofBz = Bz_->size(); @@ -1219,11 +1219,11 @@ void ElectroMagn3D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; idata(); double *const __restrict__ BzmBTIS3 = Bz_mBTIS3->data(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofByBTIS3 = By_mBTIS3->size(); #pragma acc parallel present(By3D[0:sizeofBy],BymBTIS3[0:sizeofByBTIS3]) #pragma acc loop gang @@ -1305,11 +1305,11 @@ void ElectroMagn3D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; isize(); #pragma acc parallel present(Bz3D[0:sizeofBz],BzmBTIS3[0:sizeofBzBTIS3]) #pragma acc loop gang @@ -1332,11 +1332,11 @@ void ElectroMagn3D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; icopyFrom( Br_[imode] ); if (input[2] && copy[2]) Bt_m[imode]->copyFrom( Bt_[imode] ); } - ElectroMagnAM *emAM = static_cast( patch->EMfields ); + // ElectroMagnAM *emAM = static_cast( patch->EMfields ); //emAM->compute_B_m_fromEB(); } @@ -1900,7 +1900,7 @@ void ElectroMagnAM::compute_B_m_fromEB() { const unsigned int nl_p = dimPrim[0]; const unsigned int nl_d = dimDual[0]; - const unsigned int nr_p = dimPrim[1]; + // const unsigned int nr_p = dimPrim[1]; const unsigned int nr_d = dimDual[1]; 
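All the centerMagneticFields hunks apply one pragma ladder: OpenACC takes parallel present(...) with gang/worker on the outer loop and loop vector on the inner one, OpenMP target takes a collapsed teams distribute parallel for, and CPU builds keep omp simd on the inner loop. A reduced 2D sketch with names from the hunks; the arrays are assumed already present on the device:

    #if defined( SMILEI_ACCELERATOR_GPU_OACC )
        #pragma acc parallel present( Bx2D[0:sizeofBx], Bx2D_m[0:sizeofBx] )
        #pragma acc loop gang worker
    #elif defined( SMILEI_ACCELERATOR_GPU_OMP )
        #pragma omp target
        #pragma omp teams distribute parallel for collapse( 2 )
    #endif
    for( unsigned int x = 0; x < nx_p; ++x ) {
    #ifdef SMILEI_ACCELERATOR_GPU_OACC
        #pragma acc loop vector
    #endif
    #if !defined( SMILEI_ACCELERATOR_GPU )
        #pragma omp simd
    #endif
        for( unsigned int y = 0; y < ny_d; ++y ) {
            // time-centering: average old and new B at each node
            Bx2D_m[x * ny_d + y] = ( Bx2D[x * ny_d + y] + Bx2D_m[x * ny_d + y] ) * 0.5;
        }
    }
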
const unsigned int Nmodes = El_.size(); diff --git a/src/ElectroMagn/ElectroMagnAM.h b/src/ElectroMagn/ElectroMagnAM.h index 979581b4c..cd3063113 100755 --- a/src/ElectroMagn/ElectroMagnAM.h +++ b/src/ElectroMagn/ElectroMagnAM.h @@ -157,7 +157,7 @@ class ElectroMagnAM : public ElectroMagn void computeTotalRhoJ() override; -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // //! Method used to compute the total charge density and currents by summing over all species on Device // void computeTotalRhoJOnDevice() override ; // #endif diff --git a/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp b/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp index 42ce8c381..2d257cbd5 100755 --- a/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp +++ b/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp @@ -68,9 +68,9 @@ ElectroMagnBC2D_SM::ElectroMagnBC2D_SM( Params ¶ms, Patch *patch, unsigned i ElectroMagnBC2D_SM::~ElectroMagnBC2D_SM() { - for (int i=0 ; inumber_of_points_; const int sizeofE1 = E[1]->number_of_points_; const int sizeofE2 = E[2]->number_of_points_; @@ -182,7 +182,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( db1, b1_size ); if( axis0_ == 0 ) { // for By^(d,p) -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E2[0:sizeofE2],B0[0:sizeofB0],B1[0:sizeofB1],B_ext1[0:B_ext_size1],B_ext0[0:B_ext_size0],db1[0:b1_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -199,7 +199,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * + B_ext1[j]; } } else { // for Bx^(p,d) -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E2[0:sizeofE2],B0[0:sizeofB0],B1[0:sizeofB1],B_ext1[0:B_ext_size1],B_ext0[0:B_ext_size0],db1[0:b1_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -234,7 +234,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * // for Bz^(d,d) if( axis0_ == 0 ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E1[0:sizeofE1],B2[0:sizeofB2],B_ext2[0:B_ext_size2],db2[0:b2_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -247,7 +247,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * } } else { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E0[0:sizeofE0],B2[0:sizeofB2],B_ext2[0:B_ext_size2],db2[0:b2_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) diff --git a/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp b/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp index 3ae113e60..ba4e61b28 100755 --- a/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp +++ b/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp @@ -186,7 +186,7 @@ void ElectroMagnBC3D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * const int isBoundary2min = patch->isBoundary( axis2_, 0 ); const int isBoundary2max = patch->isBoundary( axis2_, 1 ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC const int sizeofE0 = E[axis0_]->number_of_points_; const int sizeofE1 = E[axis1_]->number_of_points_; const int sizeofE2 = E[axis2_]->number_of_points_; @@ -217,7 +217,7 @@ void ElectroMagnBC3D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * // B1 if( axis0_ == 0 ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef 
SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E2[0:sizeofE2],B0[0:sizeofB0],B1[0:sizeofB1],B_ext1[0:B_ext_size1],B_ext0[0:B_ext_size0],db1[0:b1_size]) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -225,7 +225,7 @@ void ElectroMagnBC3D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int j=isBoundary1min; j( fields->Ex_ ); Field1D *Ey1D = static_cast( fields->Ey_ ); Field1D *Ez1D = static_cast( fields->Ez_ ); - Field1D *Bx1D = static_cast( fields->Bx_ ); + // Field1D *Bx1D = static_cast( fields->Bx_ ); Field1D *By1D = static_cast( fields->By_ ); Field1D *Bz1D = static_cast( fields->Bz_ ); Field1D *Jx1D = static_cast( fields->Jx_ ); diff --git a/src/ElectroMagnSolver/MA_Solver2D_norm.cpp b/src/ElectroMagnSolver/MA_Solver2D_norm.cpp index d12e021c1..4cd0d7d7c 100755 --- a/src/ElectroMagnSolver/MA_Solver2D_norm.cpp +++ b/src/ElectroMagnSolver/MA_Solver2D_norm.cpp @@ -37,7 +37,7 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) // double sumJz = 0; // Electric field Ex^(d,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -52,10 +52,10 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_d; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { @@ -64,7 +64,7 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) } // Electric field Ey^(p,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Ey2D[0:sizeofEy], Jy2D[0:sizeofEy], Bz2D[0:sizeofBz] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -72,10 +72,10 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_p; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -84,7 +84,7 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) } // Electric field Ez^(p,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Ez2D[0:sizeofEz], Jz2D[0:sizeofEz], Bx2D[0:sizeofBx], By2D[0:sizeofBy] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -92,10 +92,10 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_p; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { diff --git a/src/ElectroMagnSolver/MA_Solver3D_norm.cpp b/src/ElectroMagnSolver/MA_Solver3D_norm.cpp index 9b2a089cc..7ffea26c0 100755 --- a/src/ElectroMagnSolver/MA_Solver3D_norm.cpp +++ b/src/ElectroMagnSolver/MA_Solver3D_norm.cpp 
@@ -35,7 +35,7 @@ void MA_Solver3D_norm::operator()( ElectroMagn *fields ) const unsigned int nz_d = fields->dimDual[2]; // Electric field Ex^(d,p,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -50,11 +50,11 @@ void MA_Solver3D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; iBz_->data(); // [x * ny_d + y] : dual in x,y primal in z // Magnetic field Bx^(p,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -48,10 +48,10 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_d - 1; ++x ) { -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int y = 1; y < ny_d - 1; ++y ) { @@ -59,7 +59,7 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) } } // Magnetic field By^(d,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( By2D[0:sizeofBy], Ez2D[0:sizeofEz] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -67,10 +67,10 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 1; x < nx_d - 1; ++x ) { -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int y = 0; y < ny_p; ++y ) { @@ -79,7 +79,7 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) } // Magnetic field Bz^(d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Bz2D[0:sizeofBy], Ex2D[0:sizeofEx], Ey2D[0:sizeofEz] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -87,10 +87,10 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 1; x < nx_d - 1; ++x ) { -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int y = 1; y < ny_d - 1; ++y ) { diff --git a/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp b/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp index 5930af3e1..f70159699 100755 --- a/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp +++ b/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp @@ -34,7 +34,7 @@ void MF_Solver3D_Yee::operator()( ElectroMagn *fields ) const double * __restrict__ Ez3D = isEFilterApplied ? 
fields->filter_->Ez_[0]->data() : fields->Ez_->data(); // Magnetic field Bx^(p,d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -49,11 +49,11 @@ void MF_Solver3D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; i dA_over_dx_fdtd = ( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dx) ; - std::complex dA_over_dx = dA_over_dx_fdtd - + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + // + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A std::complex d2A_over_dx2_fdtd = ( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) )/(dx*dx) ; std::complex d2A_over_dx2 = d2A_over_dx2_fdtd @@ -590,8 +590,8 @@ void PML_Solver2D_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, // ---- // dA/dx = dA/dx + ik0 A std::complex dA_over_dx_fdtd = ( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dx) ; - std::complex dA_over_dx = dA_over_dx_fdtd - + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + // + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A std::complex d2A_over_dx2_fdtd = ( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) )/(dx*dx) ; std::complex d2A_over_dx2 = d2A_over_dx2_fdtd diff --git a/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp b/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp index 7e4e740c7..d8c65645a 100644 --- a/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp +++ b/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp @@ -395,7 +395,6 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, double k0 = 1.; // laser wavenumber std::complex source_term_x ; std::complex source_term_y ; - double mpml_ratio = 0.00; if (iDim == 0) { for( unsigned int k=0 ; k<1 ; k++ ) { @@ -405,7 +404,7 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, // dA/dx = dA/dx + ik0 A // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dG_over_dx_fdtd = ( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = ( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl) ; @@ -494,7 +493,7 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, // dA/dx = dA/dx + ik0 A // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dA_over_dx_fdtd = ( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dl) ; - std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2A_over_dx2_fdtd = ( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) )/(dl*dl) ; @@ -635,8 +634,8 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, for( unsigned int 
j=solvermin ; j < solvermax ; j++ ) { // y loop // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dG_over_dx_fdtd = ( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd - + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + // + i1*k0*( *G_n_pml )( i, j ) ; // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = ( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl) ; std::complex d2G_over_dx2 = d2G_over_dx2_fdtd diff --git a/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp b/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp index 771f12e37..c2a5c4087 100644 --- a/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp +++ b/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp @@ -400,7 +400,6 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en double k0 = 1.; // laser wavenumber std::complex source_term_x ; std::complex source_term_y ; - double mpml_ratio = 0.00; if (iDim == 0) { for( unsigned int k=0 ; k<1 ; k++ ) { @@ -410,7 +409,7 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en // dA/dx = dA/dx + ik0 A // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dG_over_dx_fdtd = (1.+delta)*( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) - delta*( ( *G_n_pml )( i+2, j )-( *G_n_pml )( i-2, j ) )/(4.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = (1.+delta)*( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl)-delta*( ( *G_n_pml )( i-2, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+2, j ) )/(4.*dl*dl) ; @@ -490,7 +489,7 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en for( unsigned int i=solvermin ; i dA_over_dx_fdtd = (1.+delta)*( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dl) - delta*( ( *A_n_pml )( i+2, j )-( *A_n_pml )( i-2, j ) )/(4.*dl) ; - std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2A_over_dx2_fdtd = (1.+delta)*( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) )/(dl*dl)-delta*( ( *A_n_pml )( i-2, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+2, j ) )/(4.*dl*dl) ; @@ -591,7 +590,7 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en for( unsigned int i=2 ; i dG_over_dx_fdtd = (1.+delta)*( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) - delta*( ( *G_n_pml )( i+2, j )-( *G_n_pml )( i-2, j ) )/(4.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = (1.+delta)*( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( 
i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl)-delta*( ( *G_n_pml )( i-2, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+2, j ) )/(4.*dl*dl) ; diff --git a/src/Field/Field.cpp b/src/Field/Field.cpp index 19c820d1d..0d8427f1e 100644 --- a/src/Field/Field.cpp +++ b/src/Field/Field.cpp @@ -5,14 +5,14 @@ void Field::put_to( double val ) { SMILEI_ASSERT( data_ != nullptr ); -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) const bool is_hostptr_mapped_on_device = smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( data_ ); #endif // NVCC's OpenACC needs that redundant pointer value double* an_other_data_pointer = data_; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) // Test if data exists on GPU, put_to can be used on CPU and GPU during a simulation #pragma acc parallel present( an_other_data_pointer [0:size()] ) if( is_hostptr_mapped_on_device ) #pragma acc loop gang worker vector @@ -25,7 +25,7 @@ void Field::put_to( double val ) } } -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) //! copy the field array from Host to Device void Field::copyFromHostToDevice() { diff --git a/src/Field/Field.h b/src/Field/Field.h index 669106245..563705ab1 100755 --- a/src/Field/Field.h +++ b/src/Field/Field.h @@ -188,7 +188,7 @@ class Field virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) = 0; -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) //! Compute the norm2OnDevice of the field virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) = 0; #endif @@ -234,7 +234,7 @@ class Field return sum; } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) inline double __attribute__((always_inline)) normOnDevice() { @@ -245,7 +245,7 @@ class Field #pragma omp target teams distribute parallel for \ map(tofrom: sum) map(to: number_of_points_) \ reduction(+:sum) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field) //deviceptr( data_ ) #pragma acc loop gang worker vector reduction(+:sum) #endif @@ -279,7 +279,7 @@ class Field virtual void extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) = 0; virtual void inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) = 0; -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) //! copy the field from Host to Device void copyFromHostToDevice(); diff --git a/src/Field/Field1D.cpp b/src/Field/Field1D.cpp index d0fa18b2f..59f085f81 100755 --- a/src/Field/Field1D.cpp +++ b/src/Field/Field1D.cpp @@ -190,7 +190,7 @@ double Field1D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double Field1D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { ERROR("Not implemented"); diff --git a/src/Field/Field1D.h b/src/Field/Field1D.h index 0ff09cd1e..228cc586f 100755 --- a/src/Field/Field1D.h +++ b/src/Field/Field1D.h @@ -92,7 +92,7 @@ class Field1D : public Field virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! 
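Field::put_to above attaches an if() clause to the OpenACC region so the same routine runs on the host when the buffer was never mapped, and on the device when it was. A minimal sketch of that conditional-offload idiom as a free function; the mapping query is the one used above:

    void put_to( double *data, int n, double val )
    {
    #if defined( SMILEI_ACCELERATOR_GPU_OACC )
        const bool on_device =
            smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( data );
        // Offload only if the pointer is mapped; otherwise the loop runs on the host.
        #pragma acc parallel present( data[0:n] ) if( on_device )
        #pragma acc loop gang worker vector
    #endif
        for( int i = 0; i < n; i++ ) {
            data[i] = val;
        }
    }
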
Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Field/Field2D.cpp b/src/Field/Field2D.cpp index a089a0d45..94051fed6 100755 --- a/src/Field/Field2D.cpp +++ b/src/Field/Field2D.cpp @@ -71,7 +71,7 @@ Field2D::~Field2D() for (int iside=0 ; iside<(int)(sendFields_.size()) ; iside++ ) { if ( sendFields_[iside] != NULL ) { -#if defined ( SMILEI_ACCELERATOR_MODE ) +#if defined ( SMILEI_ACCELERATOR_GPU ) if ( sendFields_[iside]->isOnDevice() ) { sendFields_[iside]->deleteOnDevice(); @@ -220,7 +220,7 @@ double Field2D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double Field2D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { @@ -247,7 +247,7 @@ double Field2D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3 map(to: ny, idxlocalstart[0], idxlocalstart[1], iystart, iyend) \ /* is_device_ptr( data_ )*/ \ reduction(+:nrj) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field) //deviceptr( data_ ) #pragma acc loop gang worker vector collapse(2) reduction(+:nrj) #endif @@ -333,7 +333,7 @@ void Field2D::create_sub_fields( int iDim, int iNeighbor, int ghost_size ) sendFields_[iDim*2+iNeighbor] = new Field2D(size); recvFields_[iDim*2+iNeighbor] = new Field2D(size); -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) if( ( name[0] == 'B' ) || ( name[0] == 'J' || name[0] == 'R' ) ) { sendFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); recvFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); @@ -341,7 +341,7 @@ void Field2D::create_sub_fields( int iDim, int iNeighbor, int ghost_size ) #endif } else if ( ghost_size != (int)(sendFields_[iDim*2+iNeighbor]->dims_[iDim]) ) { -#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) ERROR( "To Do GPU : envelope" ); #endif delete sendFields_[iDim*2+iNeighbor]; @@ -381,7 +381,7 @@ void Field2D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma omp target if( should_manipulate_gpu_memory ) #pragma omp teams distribute parallel for collapse( 2 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( (name.substr(0,1) == "B") ); @@ -389,7 +389,7 @@ void Field2D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; isize(); const int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "B" ); @@ -437,7 +437,7 @@ void Field2D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; isize(); const int fSize = number_of_points_; bool fieldName( ((name.substr(0,1) == "J") || (name.substr(0,1) == "R") ) && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub )); @@ -486,7 +486,7 @@ void Field2D::extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; isize(); int fSize = number_of_points_; bool fieldName( name.substr(0,1) == 
"J" || name.substr(0,1) == "R"); @@ -535,7 +535,7 @@ void Field2D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif @@ -81,7 +81,7 @@ Field3D::~Field3D() for( unsigned int iside=0 ; isideisOnDevice() ) { @@ -102,7 +102,9 @@ Field3D::~Field3D() } } if( data_!=NULL ) { +#if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete (data_[0:number_of_points_]) if (acc_deviceptr(data_) != NULL) +#endif delete [] data_; for( unsigned int i=0; idata_3D[i]; @@ -248,7 +250,7 @@ double Field3D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } // Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double Field3D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { double nrj( 0. ); @@ -277,7 +279,7 @@ double Field3D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3 map(to: ny, nz, ixstart, ixend, iystart, iyend, izstart, izend) \ /*is_device_ptr( data_ ) */ \ reduction(+:nrj) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field[0:number_of_points_]) //deviceptr( data_ ) #pragma acc loop gang worker vector collapse(3) reduction(+:nrj) #endif @@ -405,7 +407,7 @@ void Field3D::create_sub_fields ( int iDim, int iNeighbor, int ghost_size ) sendFields_[iDim*2+iNeighbor] = new Field3D(size); recvFields_[iDim*2+iNeighbor] = new Field3D(size); -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) if( ( name[0] == 'B' ) || ( name[0] == 'J' || name[0] == 'R' ) ) { @@ -427,7 +429,7 @@ void Field3D::create_sub_fields ( int iDim, int iNeighbor, int ghost_size ) } else if( ghost_size != (int) sendFields_[iDim*2+iNeighbor]->dims_[iDim] ) { -#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) ERROR( "To Do GPU : envelope" ); #endif delete sendFields_[iDim*2+iNeighbor]; @@ -463,7 +465,7 @@ void Field3D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma omp target if( is_the_right_field ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( (name.substr(0,1) == "B") ); @@ -471,11 +473,11 @@ void Field3D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { @@ -514,7 +516,7 @@ void Field3D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) map( tofrom \ : field [0:fSize] ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); const int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "B" ); @@ -522,11 +524,11 @@ void Field3D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) #pragma acc 
loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { @@ -566,7 +568,7 @@ void Field3D::extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) map( to \ : field [0:fSize] ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( (name.substr(0,1) == "J") || (name.substr(0,1) == "R")); @@ -575,11 +577,11 @@ void Field3D::extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { @@ -618,7 +620,7 @@ void Field3D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) map( tofrom \ : field [0:fSize] ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "J" || name.substr(0,1) == "R"); @@ -627,11 +629,11 @@ void Field3D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { diff --git a/src/Field/Field3D.h b/src/Field/Field3D.h index cc9524790..9f9ce4c9a 100755 --- a/src/Field/Field3D.h +++ b/src/Field/Field3D.h @@ -100,7 +100,7 @@ class Field3D : public Field virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Field/cField.h b/src/Field/cField.h index c37aa9514..d76de6ed7 100755 --- a/src/Field/cField.h +++ b/src/Field/cField.h @@ -63,7 +63,7 @@ class cField : public Field virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override = 0; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) = 0; #endif diff --git a/src/Field/cField1D.cpp b/src/Field/cField1D.cpp index 77b0c2685..6a79da95a 100755 --- a/src/Field/cField1D.cpp +++ b/src/Field/cField1D.cpp @@ -191,7 +191,7 @@ double cField1D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! 
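The norm2OnDevice implementations above express the field-energy sum as a collapsed triple-loop reduction on both backends. A trimmed sketch over full extents (the real code restricts the loops to istart/bufsize); field is assumed already resident on the device:

    double norm2_on_device( const double *field,
                            unsigned int nx, unsigned int ny, unsigned int nz )
    {
        double nrj = 0.;
    #if defined( SMILEI_ACCELERATOR_GPU_OMP )
        #pragma omp target teams distribute parallel for collapse( 3 ) \
            map( tofrom : nrj ) reduction( + : nrj )
    #elif defined( SMILEI_ACCELERATOR_GPU_OACC )
        #pragma acc parallel present( field[0:nx * ny * nz] )
        #pragma acc loop gang worker vector collapse( 3 ) reduction( + : nrj )
    #endif
        for( unsigned int i = 0; i < nx; i++ ) {
            for( unsigned int j = 0; j < ny; j++ ) {
                for( unsigned int k = 0; k < nz; k++ ) {
                    const double v = field[( i * ny + j ) * nz + k];
                    nrj += v * v;
                }
            }
        }
        return nrj;
    }
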
Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double cField1D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { ERROR("Not implemented"); diff --git a/src/Field/cField1D.h b/src/Field/cField1D.h index 43f2030e3..27b15bfc1 100755 --- a/src/Field/cField1D.h +++ b/src/Field/cField1D.h @@ -94,7 +94,7 @@ class cField1D : public cField virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Field/cField2D.cpp b/src/Field/cField2D.cpp index e1ca5560a..57ff6ea81 100755 --- a/src/Field/cField2D.cpp +++ b/src/Field/cField2D.cpp @@ -219,7 +219,7 @@ double cField2D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double cField2D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { ERROR("Not implemented"); diff --git a/src/Field/cField2D.h b/src/Field/cField2D.h index d447d4f2e..26ee995c9 100755 --- a/src/Field/cField2D.h +++ b/src/Field/cField2D.h @@ -84,7 +84,7 @@ class cField2D : public cField virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Field/cField3D.cpp b/src/Field/cField3D.cpp index 84510f401..f4249e134 100755 --- a/src/Field/cField3D.cpp +++ b/src/Field/cField3D.cpp @@ -218,7 +218,7 @@ double cField3D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double cField3D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { ERROR("Not implemented"); diff --git a/src/Field/cField3D.h b/src/Field/cField3D.h index a81f293fc..0db1f6835 100755 --- a/src/Field/cField3D.h +++ b/src/Field/cField3D.h @@ -84,7 +84,7 @@ class cField3D : public cField virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! 
Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Interpolator/Interpolator2D2Order.cpp b/src/Interpolator/Interpolator2D2Order.cpp index 0254294f5..795ab996d 100755 --- a/src/Interpolator/Interpolator2D2Order.cpp +++ b/src/Interpolator/Interpolator2D2Order.cpp @@ -180,7 +180,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, const double *const __restrict__ By2D = static_cast( EMfields->By_m )->data(); const double *const __restrict__ Bz2D = static_cast( EMfields->Bz_m )->data(); -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) const int sizeofEx = EMfields->Ex_->size(); const int sizeofEy = EMfields->Ey_->size(); const int sizeofEz = EMfields->Ez_->size(); @@ -207,7 +207,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, position_x /* [first_index:npart_range_size] */, \ position_y /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 1 * nparts ) - first_index; @@ -260,7 +260,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, delta[1*nparts+ipart] = delta_p[1]; } - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } else{ // with B-TIS3 interpolation @@ -276,7 +276,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, position_x /* [first_index:npart_range_size] */, \ position_y /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 1 * nparts ) - first_index; @@ -337,7 +337,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, delta[1*nparts+ipart] = delta_p[1]; } // end ipart loop - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } // end with B-TIS interpolation diff --git a/src/Interpolator/Interpolator3D2Order.cpp b/src/Interpolator/Interpolator3D2Order.cpp index 9e594f20b..f40239836 100755 --- a/src/Interpolator/Interpolator3D2Order.cpp +++ b/src/Interpolator/Interpolator3D2Order.cpp @@ -185,8 +185,6 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part int *const __restrict__ iold = smpi->dynamics_iold[ithread].data(); double *const __restrict__ delta = smpi->dynamics_deltaold[ithread].data(); - unsigned int buffer_size = smpi->dynamics_Epart[ithread].size(); - const double *const __restrict__ position_x = particles.getPtrPosition( 0 ); const double *const __restrict__ position_y = particles.getPtrPosition( 1 ); const double *const __restrict__ position_z = particles.getPtrPosition( 2 ); @@ -198,7 +196,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part const double *const __restrict__ By3D = EMfields->By_m->data_; const double *const __restrict__ Bz3D = EMfields->Bz_m->data_; -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) const int sizeofEx = EMfields->Ex_->size(); const int sizeofEy = EMfields->Ey_->size(); const 
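The interpolator wrappers above give *this a device lifetime scoped to the call: enter data create plus update device before the particle loop, exit data delete after it. A skeleton of that pattern; interpolate() stands in for the real member work and is hypothetical:

    void Interp::fieldsWrapper( double *Epart, const double *position_x,
                                int first_index, int last_index )
    {
    #if defined( SMILEI_ACCELERATOR_GPU_OACC )
        #pragma acc enter data create( this )   // allocate a device copy of the object
        #pragma acc update device( this )       // push current member values
        #pragma acc parallel deviceptr( Epart, position_x )
        #pragma acc loop gang worker vector
    #endif
        for( int ipart = first_index; ipart < last_index; ipart++ ) {
            Epart[ipart] = interpolate( position_x[ipart] ); // runs on device under OpenACC
        }
    #if defined( SMILEI_ACCELERATOR_GPU_OACC )
        #pragma acc exit data delete( this )    // release the device copy
    #endif
    }
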
int sizeofEz = EMfields->Ez_->size(); @@ -224,7 +222,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part position_y /* [first_index:npart_range_size] */, \ position_z /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 2 * nparts ) - first_index; @@ -282,7 +280,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part delta[1*nparts+ipart] = delta_p[1]; delta[2*nparts+ipart] = delta_p[2]; } - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } else { // with B-TIS3 interpolation @@ -302,7 +300,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part position_y /* [first_index:npart_range_size] */, \ position_z /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 2 * nparts ) - first_index; @@ -368,7 +366,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part delta[ipart+0*nparts] = delta_p[0]; delta[ipart+1*nparts] = delta_p[1]; delta[ipart+2*nparts] = delta_p[2]; - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } // end ipart loop diff --git a/src/Interpolator/Interpolator3D2Order.h b/src/Interpolator/Interpolator3D2Order.h index 52f0335a0..1fa07438d 100755 --- a/src/Interpolator/Interpolator3D2Order.h +++ b/src/Interpolator/Interpolator3D2Order.h @@ -59,7 +59,7 @@ class Interpolator3D2Order : public Interpolator3D int idx, int idy, int idz, - int nx, + int /*nx*/, int ny, int nz ) { diff --git a/src/MovWindow/SimWindow.cpp b/src/MovWindow/SimWindow.cpp index 6dbb5da57..4ee9781c7 100755 --- a/src/MovWindow/SimWindow.cpp +++ b/src/MovWindow/SimWindow.cpp @@ -383,7 +383,7 @@ void SimWindow::shift( VectorPatch &vecPatches, SmileiMPI *smpi, Params ¶ms, } // end loop nSpecies -#if defined ( SMILEI_ACCELERATOR_MODE ) +#if defined ( SMILEI_ACCELERATOR_GPU ) if( params.gpu_computing ) { for( auto spec: mypatch->vecSpecies ) { spec->allocateParticlesOnDevice(); @@ -398,7 +398,7 @@ void SimWindow::shift( VectorPatch &vecPatches, SmileiMPI *smpi, Params ¶ms, } // end test patch_particle_created[ithread][j] -#if defined ( SMILEI_ACCELERATOR_MODE ) +#if defined ( SMILEI_ACCELERATOR_GPU ) // if ( params.gpu_computing ) { // Initializes only field data structures, particle data structure are initialized separately mypatch->allocateAndCopyFieldsOnDevice(); diff --git a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp index 6f7b9e0df..8136f36ff 100755 --- a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp +++ b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp @@ -10,7 +10,7 @@ #include "MultiphotonBreitWheeler.h" #include "Species.h" -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) #define __HIP_PLATFORM_NVCC__ #define __HIP_PLATFORM_NVIDIA__ #include "gpuRandom.h" @@ -248,7 +248,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, double *const __restrict__ pair1_chi = 
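Beyond the macro renames, the "fix many warnings" hunks silence -Wunused in three recurring ways: deleting a dead local (buffer_size above), commenting a declaration out where it still documents intent (the PML dA_over_dx terms, mpml_ratio), and anonymizing an unused parameter (int /*nx*/). A compact sketch of the three idioms, with illustrative names:

    // 1) Unused parameter: drop the name, keep it as a comment for readers.
    static int rowOffset( int i, int /*nx*/, int ny )
    {
        return i * ny;
    }

    void example( int i, int ny )
    {
        // 2) Dead local: simply delete it.
        // unsigned int buffer_size = ...;      // removed in the hunks above

        // 3) Declaration kept as documentation: comment it out in place.
        // double mpml_ratio = 0.00;            // term no longer used by the solver

        rowOffset( i, 0, ny );
    }
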
diff --git a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp
index 6f7b9e0df..8136f36ff 100755
--- a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp
+++ b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp
@@ -10,7 +10,7 @@
 #include "MultiphotonBreitWheeler.h"
 #include "Species.h"
 
-#if defined(SMILEI_OPENACC_MODE)
+#if defined(SMILEI_ACCELERATOR_GPU_OACC)
 #define __HIP_PLATFORM_NVCC__
 #define __HIP_PLATFORM_NVIDIA__
 #include "gpuRandom.h"
@@ -248,7 +248,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
     double *const __restrict__ pair1_chi = new_pair[1]->has_quantum_parameter ? new_pair[1]->getPtrChi() : nullptr;
     double *const __restrict__ pair1_tau = new_pair[1]->has_Monte_Carlo_process ? new_pair[1]->getPtrTau() : nullptr;
 
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     // Parameters for random generator
     unsigned long long seed;
     unsigned long long seq;
@@ -325,7 +325,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
                                Ex[ipart-ipart_ref], Ey[ipart-ipart_ref], Ez[ipart-ipart_ref],
                                Bx[ipart-ipart_ref], By[ipart-ipart_ref], Bz[ipart-ipart_ref] );
 
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
         }
 
@@ -349,7 +349,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
             while( tau[ipart] <= epsilon_tau_ ) {
                 //tau[ipart] = -log( 1.-Rand::uniform() );
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
                 tau[ipart] = -std::log( 1.-rand_->uniform() );
 #else
@@ -406,7 +406,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
             double pair_chi[2];
 
             // Draw random number in [0,1[
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
             const double random_number = rand_->uniform();
 #else
             seed_curand_2 = (int) (ipart + 1)*(initial_seed_2 + 1); //Seed for linear generator
@@ -431,7 +431,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
             SMILEI_UNUSED( ibin );
             // Creation of new electrons in the temporary array new_pair[0]
             new_pair[0]->createParticles( mBW_pair_creation_sampling_[0] );
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
             // Final size
             int nparticles = new_pair[0]->size();
 
@@ -442,7 +442,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
 #endif
 
             // For all new particles
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
             #pragma omp simd
 #endif
             for( int ipair=i_pair_start; ipair < i_pair_start+mBW_pair_creation_sampling_[0]; ipair++ ) {
@@ -466,7 +466,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
                 } // + new_pair[k].momentum(i,ipair)*remaining_dt*inv_gamma;
 
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
                 // Old positions
                 if( particles.keepOldPositions() ) {
                     pair0_position_old_x[ipair]=position_x[ipart] ;
@@ -494,7 +494,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
             // Create particle for the second pair species
             new_pair[1]->createParticles( mBW_pair_creation_sampling_[1] );
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
             // Final size
             nparticles = new_pair[1]->size();
 
@@ -505,7 +505,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
 #endif
 
             // For all new particles
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
             #pragma omp simd
 #endif
             for( auto ipair=i_pair_start; ipair < i_pair_start + mBW_pair_creation_sampling_[1]; ipair++ ) {
@@ -530,7 +530,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
                 } // + new_pair[k].momentum(i,ipair)*remaining_dt*inv_gamma;
 
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
                 // Old positions
                 if( particles.keepOldPositions() ) {
                     pair1_position_old_x[ipair]=position_x[ipart] ;
@@ -629,7 +629,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles,
             }
         } // end ipart loop
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     }
 #endif
 }
@@ -795,7 +795,7 @@ void MultiphotonBreitWheeler::removeDecayedPhotonsWithoutBinCompression(
             if( ipart < last_photon_index ) {
                 // The last existing photon comes to the position of
                 // the deleted photon
-#ifndef SMILEI_OPENACC_MODE
+#ifndef SMILEI_ACCELERATOR_GPU_OACC
                 particles.overwriteParticle( last_photon_index, ipart );
 #else
 #endif
diff --git a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h
index 6e14a37f3..71315d79a 100755
--- a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h
+++ b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h
@@ -115,7 +115,7 @@ class MultiphotonBreitWheeler
 //! \param bmin Pointer toward the first particle index of the bin in the Particles object
 //! \param bmax Pointer toward the last particle index of the bin in the Particles object
 //! \param ithread Thread index
-//#ifdef SMILEI_OPENACC_MODE
+//#ifdef SMILEI_ACCELERATOR_GPU_OACC
 //    #pragma acc routine seq
 //#endif
     void removeDecayedPhotonsWithoutBinCompression(
diff --git a/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h b/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h
index 4f7f1ce72..9bef108b6 100755
--- a/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h
+++ b/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h
@@ -54,7 +54,7 @@ class MultiphotonBreitWheelerTables
     //! the multiphoton Breit-Wheeler pair creation
     //! \param photon_chi photon quantum parameter
     //! \param[out] pair_chi quantum parameters of the pair
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     #pragma acc routine seq
 #endif
     void computePairQuantumParameter( const double photon_chi,
@@ -71,7 +71,7 @@ class MultiphotonBreitWheelerTables
     //! \param photon_chi photon quantum parameter
     //! \param gamma photon normalized energy
     // -----------------------------------------------------------------------------
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     #pragma acc routine seq
 #endif
     double computeBreitWheelerPairProductionRate(
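The `#ifndef SMILEI_ACCELERATOR_GPU_OACC` branches above keep two random-number paths: the host draws from the shared `rand_` object, while the device derives a per-particle seed so threads never contend for generator state. The sampling itself is plain inversion of the exponential optical-depth law; a self-contained sketch, assuming a standard `<random>` engine stands in for the GPU generator (names are illustrative, not Smilei API):

```c++
#include <cmath>
#include <random>

// Draw an optical depth tau ~ Exp(1) by inverting the CDF, retrying until it
// exceeds a small epsilon, as the Monte Carlo event loop above does.
double drawOpticalDepth( std::mt19937_64 &engine, double epsilon_tau )
{
    std::uniform_real_distribution<double> uniform( 0.0, 1.0 ); // in [0,1)
    double tau = 0.0;
    while( tau <= epsilon_tau ) {
        tau = -std::log( 1.0 - uniform( engine ) ); // inverse CDF of Exp(1)
    }
    return tau;
}
```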
diff --git a/src/Params/Params.cpp b/src/Params/Params.cpp
index 803cdf9e5..b1fafcb09 100755
--- a/src/Params/Params.cpp
+++ b/src/Params/Params.cpp
@@ -837,7 +837,7 @@ Params::Params( SmileiMPI *smpi, std::vector<std::string> namelistsFiles ) :
     PyTools::extract( "gpu_computing", gpu_computing, "Main" );
     if( gpu_computing ) {
-#if( defined( SMILEI_OPENACC_MODE ) && defined( _OPENACC ) ) || defined( SMILEI_ACCELERATOR_GPU_OMP )
+#if( defined( SMILEI_ACCELERATOR_GPU_OACC ) && defined( _OPENACC ) ) || defined( SMILEI_ACCELERATOR_GPU_OMP )
         // If compiled for GPU and asking for GPU
         MESSAGE( 1, "Smilei will run on GPU devices" );
 #else
@@ -1055,21 +1055,21 @@ Params::Params( SmileiMPI *smpi, std::vector<std::string> namelistsFiles ) :
             // Extract the list of profiles and verify their content
             PyObject *p = PyTools::extract_py( "_profiles", "Laser", i_laser );
             vector<PyObject *> profiles;
-            vector<int> profiles_n = {1, 2};
             if( ! PyTools::py2pyvector( p, profiles ) ) {
                 ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile must be a list of 2 profiles", LINK_NAMELIST + std::string("#lasers") );
             }
             Py_DECREF( p );
-            if( profiles.size()!=2 ) {
+            if( profiles.size() != 2 ) {
                 ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile needs 2 profiles.", LINK_NAMELIST + std::string("#lasers") );
             }
-            if( profiles[1] == Py_None ) {
-                profiles .pop_back();
-                profiles_n.pop_back();
-            }
-            if( profiles[0] == Py_None ) {
-                profiles .erase( profiles .begin() );
-                profiles_n.erase( profiles_n.begin() );
+            vector<int> profiles_n;
+            for( unsigned int i = 0; i < 2; i++ ) {
+                if( profiles[i] == Py_None ) {
+                    Py_DECREF( profiles[i] );
+                    profiles.erase( profiles.begin() );
+                } else {
+                    profiles_n.push_back( i );
+                }
             }
             if( profiles.size() == 0 ) {
                 ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile cannot be [None, None]", LINK_NAMELIST + std::string("#lasers") );
@@ -1124,7 +1124,11 @@ Params::Params( SmileiMPI *smpi, std::vector<std::string> namelistsFiles ) :
                     propagateX( profiles, profiles_n, offset, file, keep_n_strongest_modes, angle_z );
                 }
             }
-
+
+            for( auto p: profiles ) {
+                Py_DECREF( p );
+            }
+
             n_laser_offset ++;
         }
     }
@@ -1227,7 +1231,7 @@ void Params::compute()
     // Set cluster_width_ if not set by the user
     if( cluster_width_ == -1 ) {
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
         cluster_width_ = patch_size_[0];
         // On GPU, don't do the CPU automatic cluster_width computation, only one
         // bin is expected.
@@ -1276,7 +1280,7 @@ void Params::compute()
     // Verify that cluster_width_ divides patch_size_[0] or patch_size_[n] in GPU mode
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC )
     const int kClusterWidth = getGPUClusterWidth();
 
     if( kClusterWidth < 0 ) {
@@ -1886,7 +1890,7 @@ string Params::speciesField( string field_name )
     return "";
 }
 
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC )
 bool Params::isGPUParticleBinningAvailable() const
 {
@@ -1903,7 +1907,7 @@ bool Params::isGPUParticleBinningAvailable() const
 #endif
 
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC )
 int Params::getGPUClusterWidth() const
 {
diff --git a/src/Params/Params.h b/src/Params/Params.h
index e2b0603e6..32bf63a37 100755
--- a/src/Params/Params.h
+++ b/src/Params/Params.h
@@ -386,7 +386,7 @@ class Params
     //!
     bool isGPUParticleBinningAvailable() const;
 
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC )
     //! Given dimension_id in [0, 3), return for dimension_id == :
     //!     1: the 1D value (not implemented)
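The LaserOffset rework above also tightens CPython reference counting: the `Py_None` entries that get filtered out are now `Py_DECREF`'d at the point of removal, and the profiles that were kept are released once `propagateX` is done with them. The underlying discipline, sketched under the assumption that the container owns one reference per element (function name illustrative):

```c++
#include <Python.h>
#include <vector>

// Every PyObject* extracted from a Python list carries one owned reference;
// each must be released exactly once, whether discarded early or used first.
void releaseProfiles( std::vector<PyObject *> &profiles )
{
    for( PyObject *p : profiles ) {
        Py_DECREF( p ); // give back the reference taken at extraction time
    }
    profiles.clear();
}
```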
diff --git a/src/ParticleBC/BoundaryConditionType.cpp b/src/ParticleBC/BoundaryConditionType.cpp
index 5a55d74b2..304656eca 100755
--- a/src/ParticleBC/BoundaryConditionType.cpp
+++ b/src/ParticleBC/BoundaryConditionType.cpp
@@ -18,7 +18,7 @@ void internal_inf( Species *species, int imin, int imax, int direction, double l
     energy_change = 0.;     // no energy loss during exchange
     const double* const position = species->particles->getPtrPosition( direction );
     int* const cell_keys = species->particles->getPtrCellKeys();
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc parallel deviceptr(position,cell_keys)
     #pragma acc loop gang worker vector
#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
@@ -40,7 +40,7 @@ void internal_sup( Species *species, int imin, int imax, int direction, double l
     energy_change = 0.;     // no energy loss during exchange
     const double* const position = species->particles->getPtrPosition( direction );
     int* const cell_keys = species->particles->getPtrCellKeys();
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc parallel deviceptr(position,cell_keys)
     #pragma acc loop gang worker vector
#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
@@ -92,7 +92,7 @@ void reflect_particle_inf( Species *species, int imin, int imax, int direction,
     energy_change = 0.;     // no energy loss during reflection
     double* position = species->particles->getPtrPosition(direction);
     double* momentum = species->particles->getPtrMomentum(direction);
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     #pragma acc parallel deviceptr(position,momentum)
     #pragma acc loop gang worker vector
#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
@@ -112,7 +112,7 @@ void reflect_particle_sup( Species *species, int imin, int imax, int direction,
     energy_change = 0.;     // no energy loss during reflection
     double* position = species->particles->getPtrPosition(direction);
     double* momentum = species->particles->getPtrMomentum(direction);
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     #pragma acc parallel deviceptr(position,momentum)
     #pragma acc loop gang worker vector
#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
@@ -189,9 +189,9 @@ void remove_particle_inf( Species* species,
                           int imin, int imax,
                           int direction,
                           double limit_inf,
-                          double dt,
-                          std::vector<double>& invgf,
-                          Random* rand,
+                          double /*dt*/,
+                          std::vector<double>& /*invgf*/,
+                          Random* /*rand*/,
                           double& energy_change )
 {
@@ -210,7 +210,7 @@ void remove_particle_inf( Species* species,
                                            : change_in_energy )
     #pragma omp teams distribute parallel for reduction( + \
                                            : change_in_energy )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc parallel deviceptr(position,momentum_x,momentum_y,momentum_z,weight,charge,cell_keys)
     #pragma acc loop gang worker vector reduction(+ : change_in_energy)
 #else
@@ -235,9 +235,9 @@ void remove_particle_sup( Species* species,
                           int imin, int imax,
                           int direction,
                           double limit_sup,
-                          double dt,
-                          std::vector<double>& invgf,
-                          Random* rand,
+                          double /*dt*/,
+                          std::vector<double>& /*invgf*/,
+                          Random* /*rand*/,
                           double& energy_change )
 {
@@ -256,7 +256,7 @@ void remove_particle_sup( Species* species,
                                            : change_in_energy )
     #pragma omp teams distribute parallel for reduction( + \
                                            : change_in_energy )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc parallel deviceptr(position,momentum_x,momentum_y,momentum_z,weight,charge,cell_keys)
     #pragma acc loop gang worker vector reduction(+ : change_in_energy)
 #else
diff --git a/src/ParticleBC/PartBoundCond.h b/src/ParticleBC/PartBoundCond.h
index 47ab7e235..7afd6ca9c 100755
--- a/src/ParticleBC/PartBoundCond.h
+++ b/src/ParticleBC/PartBoundCond.h
@@ -44,7 +44,7 @@ class PartBoundCond
         } else {
             int *const cell_keys = species->particles->getPtrCellKeys();
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
             #pragma acc parallel deviceptr( cell_keys )
             #pragma acc loop gang worker vector
#elif defined( SMILEI_ACCELERATOR_GPU_OMP )
diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp
index 34eaeb161..30c685155 100755
--- a/src/Particles/Particles.cpp
+++ b/src/Particles/Particles.cpp
@@ -1311,7 +1311,7 @@ void Particles::copyLeavingParticlesToBuffers( const vector<bool> copy, const ve
     // where direction goes from 0 to 6 and tells which way the particle escapes.
     // If the cell_key is -1, the particle must be destroyed so it is not extracted.
 
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC )
 
     // GPU
 
@@ -1398,13 +1398,13 @@ int Particles::eraseLeavingParticles()
     return 0;
 }
 
-int Particles::injectParticles( Particles *particles_to_inject )
+int Particles::injectParticles( Particles */*particles_to_inject*/ )
 {
     ERROR( "Device only feature, should not have come here! On CPU it's done in sortParticles." );
     return 0;
 }
 
-void Particles::importAndSortParticles( Particles *particles_to_inject )
+void Particles::importAndSortParticles( Particles */*particles_to_inject*/ )
 {
     ERROR( "Device only feature, should not have come here! On CPU it's done in sortParticles." );
 }
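The comment in `copyLeavingParticlesToBuffers` states the convention this patch leans on: a negative `cell_key` encodes the particle's fate, `-1` meaning destruction and more negative values tagging an exit direction in [0, 6]. A minimal sketch of such a decoder, under the stated assumption that exits are stored as `-2 - direction` (the exact offset lives in the boundary-condition kernels, not here):

```c++
// Decode a cell key: key >= 0 keeps the particle, -1 destroys it, and
// anything below -1 maps to an exchange-buffer direction. The -2 offset
// is an assumption for illustration, not a quote of the Smilei encoding.
inline int exitDirectionFromKey( int cell_key )
{
    if( cell_key >= -1 ) {
        return -1;            // kept or destroyed: no exchange buffer
    }
    return -2 - cell_key;     // -2 -> 0, -3 -> 1, ..., -8 -> 6
}
```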
diff --git a/src/Particles/ParticlesFactory.cpp b/src/Particles/ParticlesFactory.cpp
index 00f51bbb0..34e9a3a83 100755
--- a/src/Particles/ParticlesFactory.cpp
+++ b/src/Particles/ParticlesFactory.cpp
@@ -7,7 +7,7 @@
 // -----------------------------------------------------------------------------
 #include "ParticlesFactory.h"
 
-#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP )
 extern "C" void* CreateGPUParticles( const void* parameters, const void* a_parent_patch );
 #endif
 
@@ -22,7 +22,7 @@ Particles* ParticlesFactory::create( const Params& parameters,
     // We export a C interface to avoid potential ABI problems
     // that could occur when using two different compilers (e.g., one to
     // compile cuda/hip and another one for the host code).
-#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP )
     particles = static_cast<Particles *>( CreateGPUParticles( &parameters, &a_parent_patch ) );
 #else
     SMILEI_UNUSED( a_parent_patch );
diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp
index 8fa4022aa..ca76c6ece 100755
--- a/src/Patch/Patch.cpp
+++ b/src/Patch/Patch.cpp
@@ -445,7 +445,7 @@ void Patch::setLocationAndAllocateFields( Params &params, DomainDecomposition *d
 Patch::~Patch()
 {
-#ifdef SMILEI_ACCELERATOR_MODE
+#ifdef SMILEI_ACCELERATOR_GPU
     deleteFieldsOnDevice();
 #endif
 
@@ -1153,7 +1153,7 @@ void Patch::computePoynting()
         }
     }
 }
-#ifdef SMILEI_ACCELERATOR_MODE
+#ifdef SMILEI_ACCELERATOR_GPU
 
 // ---------------------------------------------------------------------------------------------------------------------
 // Allocate data on device
diff --git a/src/Patch/Patch.h b/src/Patch/Patch.h
index ff5a76a5c..8d06d21c2 100755
--- a/src/Patch/Patch.h
+++ b/src/Patch/Patch.h
@@ -194,7 +194,7 @@ class Patch
     //! delete Particles included in the index of particles to exchange. Assumes indexes are sorted.
     void cleanupSentParticles( int ispec, std::vector<int> *indexes_of_particles_to_exchange );
 
-#ifdef SMILEI_ACCELERATOR_MODE
+#ifdef SMILEI_ACCELERATOR_GPU
 
     //! Allocate and copy all the field grids on device
     void allocateAndCopyFieldsOnDevice();
diff --git a/src/Patch/SyncVectorPatch.cpp b/src/Patch/SyncVectorPatch.cpp
index 675529113..7f2cd183e 100755
--- a/src/Patch/SyncVectorPatch.cpp
+++ b/src/Patch/SyncVectorPatch.cpp
@@ -2,7 +2,7 @@
 #include "SyncVectorPatch.h"
 
 #include <vector>
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
 #include <openacc.h>
 #endif
 #include "Params.h"
@@ -269,7 +269,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
                 vecPatches.densitiesMPIx[ifield             ]->extract_fields_sum( 0, iNeighbor, oversize[0] );
                 vecPatches.densitiesMPIx[ifield+nPatchMPIx  ]->extract_fields_sum( 0, iNeighbor, oversize[0] );
                 vecPatches.densitiesMPIx[ifield+2*nPatchMPIx]->extract_fields_sum( 0, iNeighbor, oversize[0] );
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                 Field* field = vecPatches.densitiesMPIx[ifield ];
 //                 double* Jx = field->sendFields_[iNeighbor]->data_;
 //                 int sizeofJx = field->sendFields_[iNeighbor]->size();
@@ -291,7 +291,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
     // iDim = 0, local
     const int nFieldLocalx = vecPatches.densitiesLocalx.size() / 3;
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
     // At initialization, we may get a CPU buffer that needs to be handled on the host.
     const bool is_memory_on_device = vecPatches.densitiesLocalx.size() > 0 &&
                                      smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( vecPatches.densitiesLocalx[0]->data() );
@@ -324,9 +324,9 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
             pt2 = &( vecPatches.densitiesLocalx[ifield]->data_[0] );
             //Sum 2 ==> 1
 
-            const int last = gsp[0] * ny_ * nz_;
+            const unsigned int last = gsp[0] * ny_ * nz_;
 
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
             int ptsize = vecPatches.densitiesLocalx[ifield]->size();
             int nspace0 = size[0];
             #pragma acc parallel if ( is_memory_on_device) present(pt1[0-nspace0*ny_*nz_:ptsize],pt2[0:ptsize])
@@ -358,7 +358,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
             vecPatches( ipatch )->finalizeSumField( vecPatches.densitiesMPIx[ifield+2*nPatchMPIx], 0 ); // Jz
             for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) {
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 0, ( iNeighbor+1 )%2 ) ) {
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                     Field* field = vecPatches.densitiesMPIx[ifield ];
 //                     double* Jx = field->recvFields_[(iNeighbor+1)%2]->data_;
 //                     int sizeofJx = field->recvFields_[(iNeighbor+1)%2]->size();
@@ -402,7 +402,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
                 vecPatches.densitiesMPIy[ifield             ]->extract_fields_sum( 1, iNeighbor, oversize[1] );
                 vecPatches.densitiesMPIy[ifield+nPatchMPIy  ]->extract_fields_sum( 1, iNeighbor, oversize[1] );
                 vecPatches.densitiesMPIy[ifield+2*nPatchMPIy]->extract_fields_sum( 1, iNeighbor, oversize[1] );
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                 Field* field = vecPatches.densitiesMPIy[ifield ];
 //                 double* Jx = field->sendFields_[iNeighbor+2]->data_;
 //                 int sizeofJx = field->sendFields_[iNeighbor+2]->size();
@@ -424,7 +424,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
     // iDim = 1,
     const int nFieldLocaly = vecPatches.densitiesLocaly.size() / 3;
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
     const bool is_memory_on_device = vecPatches.densitiesLocaly.size() > 0 &&
                                      smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( vecPatches.densitiesLocaly[0]->data() );
 #endif
@@ -457,11 +457,11 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
             pt1 = &( fields[vecPatches( ipatch )->neighbor_[1][0]-h0+icomp*nPatches]->data_[size[1]*nz_] );
             pt2 = &( vecPatches.densitiesLocaly[ifield]->data_[0] );
 
-            const int outer_last   = nx_ * ny_ * nz_;
-            const int outer_stride = ny_ * nz_;
-            const int inner_last   = gsp[1] * nz_;
+            const unsigned int outer_last   = nx_ * ny_ * nz_;
+            const unsigned int outer_stride = ny_ * nz_;
+            const unsigned int inner_last   = gsp[1] * nz_;
 
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
             int ptsize = vecPatches.densitiesLocaly[ifield]->size();
             int blabla = size[1];
             #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla*nz_:ptsize],pt2[0:ptsize])
@@ -496,7 +496,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
             vecPatches( ipatch )->finalizeSumField( vecPatches.densitiesMPIy[ifield+2*nPatchMPIy], 1 ); // Jz
             for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) {
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 1, ( iNeighbor+1 )%2 ) ) {
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                     Field* field = vecPatches.densitiesMPIy[ifield ];
 //                     double* Jx = field->recvFields_[(iNeighbor+1)%2+2]->data_;
 //                     int sizeofJx = field->recvFields_[(iNeighbor+1)%2+2]->size();
@@ -538,7 +538,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
                 vecPatches.densitiesMPIz[ifield             ]->extract_fields_sum( 2, iNeighbor, oversize[2] );
                 vecPatches.densitiesMPIz[ifield+nPatchMPIz  ]->extract_fields_sum( 2, iNeighbor, oversize[2] );
                 vecPatches.densitiesMPIz[ifield+2*nPatchMPIz]->extract_fields_sum( 2, iNeighbor, oversize[2] );
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                 Field* field = vecPatches.densitiesMPIz[ifield ];
 //                 double* Jx = field->sendFields_[iNeighbor+4]->data_;
 //                 int sizeofJx = field->sendFields_[iNeighbor+4]->size();
@@ -560,7 +560,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
     // iDim = 2 local
     const int nFieldLocalz = vecPatches.densitiesLocalz.size() / 3;
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
    const bool is_memory_on_device = vecPatches.densitiesLocalz.size() > 0 &&
                                     smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( vecPatches.densitiesLocalz[0]->data() );
 #endif
@@ -594,11 +594,11 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
             pt1 = &( fields[vecPatches( ipatch )->neighbor_[2][0]-h0+icomp*nPatches]->data_[size[2]] );
             pt2 = &( vecPatches.densitiesLocalz[ifield]->data_[0] );
 
-            const int outer_last   = nx_ * ny_ * nz_;
-            const int outer_stride = nz_;
-            const int inner_last   = gsp[2];
+            const unsigned int outer_last   = nx_ * ny_ * nz_;
+            const unsigned int outer_stride = nz_;
+            const unsigned int inner_last   = gsp[2];
 
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
             int ptsize = vecPatches.densitiesLocalz[ifield]->size();
             int blabla = size[2];
             #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla:ptsize],pt2[0:ptsize])
@@ -630,7 +630,7 @@ void SyncVectorPatch::sumAllComponents( std::vector<Field *> &fields, VectorPatc
             vecPatches( ipatch )->finalizeSumField( vecPatches.densitiesMPIz[ifield+2*nPatchMPIz], 2 ); // Jz
             for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) {
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 2, ( iNeighbor+1 )%2 ) ) {
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                     Field* field = vecPatches.densitiesMPIz[ifield ];
 //                     double* Jx = field->recvFields_[(iNeighbor+1)%2+4]->data_;
 //                     int sizeofJx = field->recvFields_[(iNeighbor+1)%2+4]->size();
@@ -797,7 +797,7 @@ void SyncVectorPatch::exchangeE( Params &, VectorPatch &vecPatches, int imode, S
     SyncVectorPatch::finalizeExchangeAlongAllDirections( vecPatches.listEt_[imode], vecPatches );
 }
 
-void SyncVectorPatch::exchangeBmBTIS3( Params &params, VectorPatch &vecPatches, int imode, SmileiMPI *smpi )
+void SyncVectorPatch::exchangeBmBTIS3( Params &/*params*/, VectorPatch &vecPatches, int imode, SmileiMPI *smpi )
 {
     SyncVectorPatch::exchangeAlongAllDirections<complex<double>,cField>( vecPatches.listBr_mBTIS3[imode], vecPatches, smpi );
     SyncVectorPatch::finalizeExchangeAlongAllDirections( vecPatches.listBr_mBTIS3[imode], vecPatches );
@@ -881,7 +881,7 @@ void SyncVectorPatch::exchangeEnvEx( Params &params, VectorPatch &vecPatches, Sm
     }
 }
 
-void SyncVectorPatch::exchangeBmBTIS3( Params &params, VectorPatch &vecPatches, SmileiMPI *smpi )
+void SyncVectorPatch::exchangeBmBTIS3( Params &/*params*/, VectorPatch &vecPatches, SmileiMPI *smpi )
 {
     // exchange BmBTIS3 in Cartesian geometries
     // exchange ByBTIS3
@@ -1487,7 +1487,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongX( std::vector<Field *> &fields,
                 vecPatches.B_MPIx[ifield      ]->extract_fields_exch( 0, iNeighbor, oversize );
                 vecPatches.B_MPIx[ifield+nMPIx]->create_sub_fields  ( 0, iNeighbor, oversize );
                 vecPatches.B_MPIx[ifield+nMPIx]->extract_fields_exch( 0, iNeighbor, oversize );
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
                 Field* field = vecPatches.B_MPIx[ifield ];
                 double* By = field->sendFields_[iNeighbor]->data_;
                 int sizeofBy = field->sendFields_[iNeighbor]->size();
@@ -1580,7 +1580,7 @@ void SyncVectorPatch::finalizeExchangeAllComponentsAlongX( VectorPatch &vecPatch
             vecPatches( ipatch )->finalizeExchange( vecPatches.B_MPIx[ifield+nMPIx], 0 ); // Bz
             for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) {
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 0, ( iNeighbor+1 )%2 ) ) {
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
                     Field* field = vecPatches.B_MPIx[ifield ];
                     double* By = field->recvFields_[(iNeighbor+1)%2]->data_;
                     int sizeofBy = field->recvFields_[(iNeighbor+1)%2]->size();
@@ -1623,7 +1623,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongY( std::vector<Field *> &fields,
                 vecPatches.B1_MPIy[ifield      ]->extract_fields_exch( 1, iNeighbor, oversize );
                 vecPatches.B1_MPIy[ifield+nMPIy]->create_sub_fields  ( 1, iNeighbor, oversize );
                 vecPatches.B1_MPIy[ifield+nMPIy]->extract_fields_exch( 1, iNeighbor, oversize );
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
                 Field* field = vecPatches.B1_MPIy[ifield ];
                 double* Bx = field->sendFields_[iNeighbor+2]->data_;
                 int sizeofBx = field->sendFields_[iNeighbor+2]->size();
@@ -1671,7 +1671,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongY( std::vector<Field *> &fields,
         if( vecPatches( ipatch )->MPI_me_ == vecPatches( ipatch )->MPI_neighbor_[1][0] ) {
             pt1 = &( fields[vecPatches( ipatch )->neighbor_[1][0]-h0+icomp*nPatches]->data_[size*nz_] );
             pt2 = &( vecPatches.B1_localy[ifield]->data_[0] );
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
             int ptsize = vecPatches.B1_localy[ifield]->size();
             #pragma acc parallel present(pt1[0-size*nz_:ptsize],pt2[0:ptsize])
             #pragma acc loop gang worker vector
@@ -1711,7 +1711,7 @@ void SyncVectorPatch::finalizeExchangeAllComponentsAlongY( VectorPatch &vecPatch
             vecPatches( ipatch )->finalizeExchange( vecPatches.B1_MPIy[ifield+nMPIy], 1 ); // Bz
             for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) {
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 1, ( iNeighbor+1 )%2 ) ) {
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
                     Field* field = vecPatches.B1_MPIy[ifield ];
                     double* Bx = field->recvFields_[(iNeighbor+1)%2+2]->data_;
                     int sizeofBx = field->recvFields_[(iNeighbor+1)%2+2]->size();
@@ -1754,7 +1754,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongZ( std::vector<Field *> fields,
                 vecPatches.B2_MPIz[ifield      ]->extract_fields_exch( 2, iNeighbor, oversize );
                 vecPatches.B2_MPIz[ifield+nMPIz]->create_sub_fields  ( 2, iNeighbor, oversize );
                 vecPatches.B2_MPIz[ifield+nMPIz]->extract_fields_exch( 2, iNeighbor, oversize );
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
                 Field* field = vecPatches.B2_MPIz[ifield ];
                 double* Bx = field->sendFields_[iNeighbor+4]->data_;
                 int sizeofBx = field->sendFields_[iNeighbor+4]->size();
@@ -1799,7 +1799,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongZ( std::vector<Field *> fields,
         if( vecPatches( ipatch )->MPI_me_ == vecPatches( ipatch )->MPI_neighbor_[2][0] ) {
             pt1 = &( fields[vecPatches( ipatch )->neighbor_[2][0]-h0+icomp*nPatches]->data_[size] );
             pt2 = &( vecPatches.B2_localz[ifield]->data_[0] );
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
             int ptsize = vecPatches.B2_localz[ifield]->size();
             #pragma acc parallel present(pt1[0-size:ptsize],pt2[0:ptsize])
             #pragma acc loop gang worker vector
@@ -1839,7 +1839,7 @@ void SyncVectorPatch::finalizeExchangeAllComponentsAlongZ( VectorPatch &vecPatch
             vecPatches( ipatch )->finalizeExchange( vecPatches.B2_MPIz[ifield+nMPIz], 2 ); // By
             for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) {
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 2, ( iNeighbor+1 )%2 ) ) {
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
                     Field* field = vecPatches.B2_MPIz[ifield ];
                     double* Bx = field->recvFields_[(iNeighbor+1)%2+4]->data_;
                     int sizeofBx = field->recvFields_[(iNeighbor+1)%2+4]->size();
diff --git a/src/Patch/SyncVectorPatch.h b/src/Patch/SyncVectorPatch.h
index 0322c1283..07435cd49 100755
--- a/src/Patch/SyncVectorPatch.h
+++ b/src/Patch/SyncVectorPatch.h
@@ -73,7 +73,7 @@ public :
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 0, iNeighbor ) ) {
                     fields[ifield]->create_sub_fields ( 0, iNeighbor, 2*oversize[0]+1+fields[ifield]->isDual_[0] );
                     fields[ifield]->extract_fields_sum( 0, iNeighbor, oversize[0] );
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                     double * pointer = fields[ifield]->sendFields_[iNeighbor]->data_;
 //                     int size = fields[ifield]->size();
 // #endif
@@ -87,7 +87,7 @@ public :
 
         // iDim = 0, local
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
         // At initialization, we may get a CPU buffer that needs to be handled on the host.
         const bool is_memory_on_device = fields.size() > 0 &&
                                          smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( fields[0]->data() );
@@ -123,7 +123,7 @@ public :
 
                 const unsigned int last = gsp[0] * ny_ * nz_;
 
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
                 int ptsize = fields[ifield]->size();
                 int nspace0 = size[0];
                 #pragma acc parallel if ( is_memory_on_device) present(pt1[0-nspace0*ny_*nz_:ptsize],pt2[0:ptsize])
@@ -177,7 +177,7 @@ public :
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 1, iNeighbor ) ) {
                     fields[ifield]->create_sub_fields ( 1, iNeighbor, 2*oversize[1]+1+fields[ifield]->isDual_[1] );
                     fields[ifield]->extract_fields_sum( 1, iNeighbor, oversize[1] );
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                     double* pointer = fields[ifield]->recvFields_[(iNeighbor+1)%2]->data_;
 //                     int size = fields[ifield]->recvFields_[(iNeighbor+1)%2]->size();
 //                     //#pragma acc update device( Jx[0:sizeofJx], Jy[0:sizeofJy], Jz[0:sizeofJz] )
@@ -192,7 +192,7 @@ public :
 
         // iDim = 1, local
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
         const bool is_memory_on_device = fields.size() > 0 &&
                                          smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( fields[0]->data() );
 #endif
@@ -220,11 +220,11 @@ public :
                 pt1 = &( *field1 )( size[1]*nz_ );
                 pt2 = &( *field2 )( 0 );
 
-                const int outer_last   = nx_ * ny_ * nz_;
-                const int outer_stride = ny_ * nz_;
-                const int inner_last   = gsp[1] * nz_;
+                const unsigned int outer_last   = nx_ * ny_ * nz_;
+                const unsigned int outer_stride = ny_ * nz_;
+                const unsigned int inner_last   = gsp[1] * nz_;
 
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
                 int ptsize = fields[ifield]->size();
                 int blabla = size[1];
                 #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla*nz_:ptsize],pt2[0:ptsize])
@@ -282,7 +282,7 @@ public :
                 if ( vecPatches( ipatch )->is_a_MPI_neighbor( 2, iNeighbor ) ) {
                     fields[ifield]->create_sub_fields ( 2, iNeighbor, 2*oversize[2]+1+fields[ifield]->isDual_[2] );
                     fields[ifield]->extract_fields_sum( 2, iNeighbor, oversize[2] );
-// #ifdef SMILEI_OPENACC_MODE
+// #ifdef SMILEI_ACCELERATOR_GPU_OACC
 //                     double* pointer = fields[ifield]->recvFields_[(iNeighbor+1)%2+2]->data_;
 //                     int size = fields[ifield]->recvFields_[(iNeighbor+1)%2+2]->size();
 // #endif
@@ -293,7 +293,7 @@ public :
 
         // iDim = 2 local
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
         const bool is_memory_on_device = fields.size() > 0 &&
                                          smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( fields[0]->data() );
 #endif
@@ -321,11 +321,11 @@ public :
                 pt1 = &( *field1 )( size[2] );
                 pt2 = &( *field2 )( 0 );
 
-                const int outer_last   = nx_ * ny_ * nz_;
-                const int outer_stride = nz_;
-                const int inner_last   = gsp[2];
+                const unsigned int outer_last   = nx_ * ny_ * nz_;
+                const unsigned int outer_stride = nz_;
+                const unsigned int inner_last   = gsp[2];
 
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
                 int ptsize = fields[ifield]->size();
                 int blabla = size[2];
                 #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla:ptsize],pt2[0:ptsize])
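The `is_memory_on_device` flag combined with `#pragma acc parallel if(...)` lets a single loop serve both host-resident and device-resident buffers: when the condition is false the region simply executes on the host. A minimal sketch of the idiom, assuming an OpenACC compiler (function and variable names are illustrative):

```c++
// Conditional offload: one loop body, two execution targets.
// If on_device is false, OpenACC falls back to host execution.
void scaleBuffer( double *pt, int n, bool on_device )
{
    #pragma acc parallel if( on_device ) present( pt[0:n] )
    #pragma acc loop gang worker vector
    for( int i = 0; i < n; i++ ) {
        pt[i] *= 2.0; // illustrative body
    }
}
```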
diff --git a/src/Patch/VectorPatch.cpp b/src/Patch/VectorPatch.cpp
index 65d68f28c..42f4dd3d8 100755
--- a/src/Patch/VectorPatch.cpp
+++ b/src/Patch/VectorPatch.cpp
@@ -301,7 +301,7 @@ void VectorPatch::reconfiguration( Params &params, Timers &timers, int itime )
 // ---------------------------------------------------------------------------------------------------------------------
 void VectorPatch::initialParticleSorting( Params &params )
 {
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE)
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC)
     // Initially I wanted to control the GPU particle sorting/bin initialization
     // here. In the end it was put in initializeDataOnDevice which is more
     // meaningful.
@@ -853,7 +853,7 @@ void VectorPatch::sumDensities( Params &params, double time_dual, Timers &timers
     #pragma omp for schedule(static)
     for( unsigned int ipatch=0 ; ipatch<this->size() ; ipatch++ ) {
         // Per species in global, Attention if output -> Sync / per species fields
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
         // At itime == 0, data is still located on the Host
         if (itime == 0) {
             ( *this )( ipatch )->EMfields->computeTotalRhoJ();
@@ -1269,7 +1269,7 @@ void VectorPatch::closeAllDiags( SmileiMPI *smpi )
 // ---------------------------------------------------------------------------------------------------------------------
 void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int itime, Timers &timers, SimWindow *simWindow )
 {
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
     bool data_on_cpu_updated = false;
 #endif
 
@@ -1277,7 +1277,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int
     timers.diags.restart();
 
     // Determine which data is required from the device
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
     bool need_particles = false;
     bool need_fields    = false;
 
@@ -1346,7 +1346,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int
     for( unsigned int idiag = 0 ; idiag < globalDiags.size() ; idiag++ ) {
         diag_timers_[idiag]->restart();
 
-// #if defined( SMILEI_ACCELERATOR_MODE)
+// #if defined( SMILEI_ACCELERATOR_GPU)
 //         if( globalDiags[idiag]->timeSelection->theTimeIsNow( itime ) &&
 //             !data_on_cpu_updated &&
 //             ( itime > 0 ) ) {
@@ -1462,7 +1462,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int
     for( unsigned int idiag = 0 ; idiag < localDiags.size() ; idiag++ ) {
         diag_timers_[globalDiags.size()+idiag]->restart();
 
-// #if defined( SMILEI_ACCELERATOR_MODE )
+// #if defined( SMILEI_ACCELERATOR_GPU )
 //         if( localDiags[idiag]->timeSelection->theTimeIsNow( itime ) &&
 //             !data_on_cpu_updated &&
 //             ( itime > 0 ) ) {
@@ -1496,7 +1496,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int
         for( unsigned int ipatch=0 ; ipatch<size() ; ipatch++ ) {
             ( *this )( ipatch )->EMfields->restartRhoJs();
-#if defined (SMILEI_ACCELERATOR_MODE)
+#if defined (SMILEI_ACCELERATOR_GPU)
             // Delete species current and rho grids from device
             for( unsigned int ispec = 0; ispec < ( *this )( ipatch )->vecSpecies.size(); ispec++ ) {
                 ( *this )( ipatch )->vecSpecies[ispec]->Species::deleteSpeciesCurrentAndChargeOnDevice(ispec, ( *this )( ipatch )->EMfields);
@@ -4402,7 +4402,7 @@ void VectorPatch::moveWindow(
     // Bring all particles and field grids to the Host (except species grids)
     // This part can be optimized by copying only the patch to be destructed
-#if defined( SMILEI_ACCELERATOR_MODE)
+#if defined( SMILEI_ACCELERATOR_GPU)
     if( simWindow->isMoving( time_dual ) || itime == simWindow->getAdditionalShiftsIteration() ) {
         copyParticlesFromDeviceToHost();
         copyFieldsFromDeviceToHost();
@@ -4412,10 +4412,11 @@ void VectorPatch::moveWindow(
 
     simWindow->shift( (*this), smpi, params, itime, time_dual, region );
 
-    if (itime == simWindow->getAdditionalShiftsIteration() ) {
+    if( itime == (int) simWindow->getAdditionalShiftsIteration() ) {
         int adjust = simWindow->isMoving(time_dual)?0:1;
-        for (unsigned int n=0;n < simWindow->getNumberOfAdditionalShifts()-adjust; n++)
+        for( unsigned int n=0; n < simWindow->getNumberOfAdditionalShifts()-adjust; n++ ) {
             simWindow->shift( (*this), smpi, params, itime, time_dual, region );
+        }
     }
 
     // Copy all Fields and Particles to the device
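The `moveWindow` hunk above casts the unsigned return value of `getAdditionalShiftsIteration()` to `int` before comparing it with the signed `itime`, instead of letting `itime` be implicitly promoted to unsigned. An illustrative stand-alone example of why the direction of the conversion matters (values and names are not from the patch):

```c++
#include <cstdio>

int main()
{
    int      itime  = -1;  // e.g. "no shift scheduled yet"
    unsigned target = 0u;  // unsigned getter result

    // Promoted comparison: -1 becomes a huge unsigned value, so this is false.
    std::printf( "unsigned compare: %s\n", itime < target ? "true" : "false" );
    // Casting the unsigned side keeps the comparison in signed arithmetic.
    std::printf( "signed compare:   %s\n", itime < (int) target ? "true" : "false" );
    return 0;
}
```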
@@ -4423,7 +4424,7 @@ void VectorPatch::moveWindow(
 
     // let's try initialising like we do at the start:
-/*#if defined( SMILEI_ACCELERATOR_MODE )
+/*#if defined( SMILEI_ACCELERATOR_GPU )
     // Allocate particle and field arrays
     // Also copy particle array content on device
     vecPatches.allocateDataOnDevice( params, &smpi,
@@ -4434,7 +4435,7 @@ void VectorPatch::moveWindow(
 #endif*/
 
     // does not do anything?
-    /*#if defined( SMILEI_ACCELERATOR_MODE)
+    /*#if defined( SMILEI_ACCELERATOR_GPU)
     if( simWindow->isMoving( time_dual ) || itime == simWindow->getAdditionalShiftsIteration() ) {
         copyFieldsFromHostToDevice();
         copyParticlesFromHostToDevice();
@@ -4609,13 +4610,12 @@ void VectorPatch::initNewEnvelope( Params & )
 } // END initNewEnvelope
 
+#if defined( SMILEI_ACCELERATOR_GPU )
 void VectorPatch::allocateDataOnDevice(Params &params,
                                        SmileiMPI *smpi,
                                        RadiationTables *radiation_tables,
                                        MultiphotonBreitWheelerTables *multiphoton_Breit_Wheeler_tables)
 {
-
-#if defined( SMILEI_ACCELERATOR_MODE )
     // TODO(Etienne M): FREE. If we have load balancing or other patch
     // creation/destruction available (which is not the case on GPU ATM),
     // we should be taking care of freeing this GPU memory.
@@ -4681,17 +4681,24 @@ void VectorPatch::allocateDataOnDevice(Params &params,
         smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( min_particle_chi_table, min_particle_chi_size );
         smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( xi_table, xi_table_size );
     }
+}
 #else
+void VectorPatch::allocateDataOnDevice(Params &,
+                                       SmileiMPI *,
+                                       RadiationTables *,
+                                       MultiphotonBreitWheelerTables *)
+{
     ERROR( "GPU related code should not be reached in CPU mode!" );
-#endif
 }
+#endif
+
 //! Clean data allocated on device
+#if defined( SMILEI_ACCELERATOR_GPU )
 void VectorPatch::cleanDataOnDevice( Params &params, SmileiMPI *smpi,
                                      RadiationTables *radiation_tables,
                                      MultiphotonBreitWheelerTables *multiphoton_Breit_Wheeler_tables)
 {
-#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP )
 
     const int npatches = this->size();
 
@@ -4801,12 +4808,17 @@ void VectorPatch::cleanDataOnDevice( Params &params, SmileiMPI *smpi,
         smilei::tools::gpu::HostDeviceMemoryManagement::DeviceFree( xi_table, xi_table_size );
     }
+}
 #else
+void VectorPatch::cleanDataOnDevice( Params &, SmileiMPI *,
+                                     RadiationTables *,
+                                     MultiphotonBreitWheelerTables *)
+{
     ERROR( "GPU related code should not be reached in CPU mode!" );
-#endif
 }
+#endif
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
 
 //! Field Synchronization from the GPU (Device) to the CPU
 //! This function updates the data on the host from the data located on the device
@@ -4846,9 +4858,7 @@ void VectorPatch::copyFieldsFromHostToDevice()
     }
 }
 
-#endif
-#if defined( SMILEI_ACCELERATOR_MODE)
 
 //! Sync all fields from device to host
 void
 VectorPatch::copyFieldsFromDeviceToHost()
@@ -4861,10 +4871,6 @@ VectorPatch::copyFieldsFromDeviceToHost()
     }
 }
 
-#endif
-
-
-#if defined( SMILEI_ACCELERATOR_MODE)
 
 //! Copy all species particles from Host to devices
 void VectorPatch::copyParticlesFromHostToDevice()
@@ -4876,9 +4882,6 @@ void VectorPatch::copyParticlesFromHostToDevice()
             }
         }
     }
 }
-#endif
-
-#if defined( SMILEI_ACCELERATOR_MODE)
 
 //! copy all patch Particles from device to Host
 void
@@ -4891,9 +4894,7 @@ VectorPatch::copyParticlesFromDeviceToHost()
     for( int ipatch = 0; ipatch < npatches; ipatch++ ) {
         for( unsigned int ispec = 0; ispec < ( *this )( ipatch )->vecSpecies.size(); ispec++ ) {
             species( ipatch, ispec )->particles->copyFromDeviceToHost();
-#if defined ( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_ACCELERATOR_MODE )
             species( ipatch, ispec )->particles->setHostBinIndex();
-#endif
             // std::cerr
             //     << "ipatch: " << ipatch
             //     << " ispec: " << ispec
@@ -4906,9 +4907,6 @@ VectorPatch::copyParticlesFromDeviceToHost()
         }
     }
 }
-#endif
-
-#if defined( SMILEI_ACCELERATOR_MODE)
 
 //! Sync all fields from device to host
 void VectorPatch::copySpeciesFieldsFromDeviceToHost()
@@ -4988,7 +4986,7 @@ void VectorPatch::dynamicsWithoutTasks( Params &params,
 
             if( spec->isProj( time_dual, simWindow ) || diag_flag ) {
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
                 if (diag_flag) {
                     spec->Species::prepareSpeciesCurrentAndChargeOnDevice(
                         ispec,
diff --git a/src/Patch/VectorPatch.h b/src/Patch/VectorPatch.h
index be5a37d21..051d78276 100755
--- a/src/Patch/VectorPatch.h
+++ b/src/Patch/VectorPatch.h
@@ -510,7 +510,7 @@ public :
                              RadiationTables * radiation_tables,
                              MultiphotonBreitWheelerTables *multiphoton_Breit_Wheeler_tables );
 
-#if defined( SMILEI_ACCELERATOR_MODE)
+#if defined( SMILEI_ACCELERATOR_GPU)
 
     //! Field Synchronization from the GPU (Device) to the host (CPU)
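The `allocateDataOnDevice`/`cleanDataOnDevice` hunks above move the `#if` guard from inside the function body to around the whole definition, so each build compiles exactly one definition: GPU builds get the real body, CPU builds a stub with unnamed parameters that aborts. A minimal sketch of the split (the function name is illustrative, not from the patch; `ERROR` is the project's own macro):

```c++
#if defined( SMILEI_ACCELERATOR_GPU )
void uploadGrids( double *data, int n )
{
    // ... device allocation and host-to-device copy ...
}
#else
void uploadGrids( double *, int ) // parameters unnamed: stub never uses them
{
    ERROR( "GPU related code should not be reached in CPU mode!" );
}
#endif
```

Compared with a guard inside the body, this keeps the unused-parameter warnings away in the CPU build and makes the preprocessor structure visible at the function boundary.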
diff --git a/src/Projector/Projector2D2OrderGPU.cpp b/src/Projector/Projector2D2OrderGPU.cpp
index cfe20eb7d..c669cc209 100755
--- a/src/Projector/Projector2D2OrderGPU.cpp
+++ b/src/Projector/Projector2D2OrderGPU.cpp
@@ -26,7 +26,7 @@ Projector2D2OrderGPU::Projector2D2OrderGPU( Params &parameters, Patch *a_patch )
     dts2 = dt / 2.0;
     dts4 = dts2 / 2.0;
 
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_ACCELERATOR_GPU_OACC )
     // When sorting is disabled, these values are invalid (-1) and the HIP
     // implementation can't be used.
     x_dimension_bin_count_ = parameters.getGPUBinCount( 1 );
@@ -41,7 +41,7 @@ Projector2D2OrderGPU::~Projector2D2OrderGPU()
     // EMPTY
 }
 
-#if defined( SMILEI_ACCELERATOR_MODE ) //SMILEI_ACCELERATOR_GPU_OMP )
+#if defined( SMILEI_ACCELERATOR_GPU ) //SMILEI_ACCELERATOR_GPU_OMP )
 extern "C" void
 currentDepositionKernel2DOnDevice( double *__restrict__ Jx,
@@ -109,6 +109,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
     /// Project global current densities (EMfields->Jx_/Jy_/Jz_)
     ///
     /* inline */ void
+#if defined( SMILEI_ACCELERATOR_GPU )//SMILEI_ACCELERATOR_GPU_OMP )
     currents( double *__restrict__ Jx,
              double *__restrict__ Jy,
             double *__restrict__ Jz,
@@ -132,7 +133,6 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
              double,
              int not_spectral )
    {
-#if defined( SMILEI_ACCELERATOR_MODE )//SMILEI_ACCELERATOR_GPU_OMP )
        currentDepositionKernel2DOnDevice( Jx,
                                           Jy,
                                            Jz,
@@ -159,15 +159,22 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
                                            j_domain_begin, nprimy,
                                            not_spectral );
+    }
 #else
+    currents( double *__restrict__ , double *__restrict__ , double *__restrict__ , int, int, int,
+              Particles &, unsigned int , unsigned int ,const double *__restrict__ ,
+              const int *__restrict__ , const double *__restrict__ , double , double , double ,
+              double , double , int , int , int , double, int )
+    {
         SMILEI_ASSERT( false );
-#endif
     }
+#endif
 
     /// Like currents(), project the particle current on the grid (Jx_/Jy_/Jz_)
     /// but also compute global current densities rho used for diagFields timestep
     ///
     /* inline */ void
+#if defined( SMILEI_ACCELERATOR_GPU )//SMILEI_ACCELERATOR_GPU_OMP )
     currentsAndDensity( double *__restrict__ Jx,
                         double *__restrict__ Jy,
                         double *__restrict__ Jz,
@@ -193,7 +200,6 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
                         double,
                         int not_spectral )
     {
-#if defined( SMILEI_ACCELERATOR_MODE )//SMILEI_ACCELERATOR_GPU_OMP )
         currentAndDensityDepositionKernelOnDevice( Jx,
                                                    Jy,
                                                    Jz,
@@ -222,10 +228,16 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
                                                    j_domain_begin, nprimy,
                                                    not_spectral );
+    }
 #else
+    currentsAndDensity( double *__restrict__ , double *__restrict__ , double *__restrict__ , double *__restrict__ ,
+                        int , int , int , int , Particles &, unsigned int , unsigned int ,
+                        const double *__restrict__ , const int *__restrict__ , const double *__restrict__ ,
+                        double , double , double , double , double , int , int , int , double, int )
+    {
         SMILEI_ASSERT( false );
-#endif
     }
+#endif
 
 } // namespace
 
@@ -233,7 +245,7 @@ void Projector2D2OrderGPU::basic( double      *rhoj,
                                   Particles   &particles,
                                   unsigned int ipart,
                                   unsigned int type,
-                                  int          bin_shift )
+                                  int          /*bin_shift*/ )
 {
     // Warning : this function is used for frozen species only. It is assumed that position = position_old !!!
@@ -306,12 +318,12 @@ void Projector2D2OrderGPU::basic( double      *rhoj,
     }
 }
 
-void Projector2D2OrderGPU::ionizationCurrents( Field      *Jx,
-                                               Field      *Jy,
-                                               Field      *Jz,
-                                               Particles  &particles,
-                                               int         ipart,
-                                               LocalFields Jion )
+void Projector2D2OrderGPU::ionizationCurrents( Field      */*Jx*/,
+                                               Field      */*Jy*/,
+                                               Field      */*Jz*/,
+                                               Particles  &/*particles*/,
+                                               int         /*ipart*/,
+                                               LocalFields /*Jion */)
 {
     ERROR( "Projector2D2OrderGPU::ionizationCurrents(): Not implemented !" );
 }
@@ -325,8 +337,8 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields,
                                                       bool         diag_flag,
                                                       bool         is_spectral,
                                                       int          ispec,
-                                                      int          icell,
-                                                      int          ipart_ref )
+                                                      int          /*icell*/,
+                                                      int          /*ipart_ref */)
 {
     std::vector<int>    &iold  = smpi->dynamics_iold[ithread];
     std::vector<double> &delta = smpi->dynamics_deltaold[ithread];
@@ -425,20 +437,20 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields,
     }
 }
 
-void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields,
-                                           Particles   &particles,
-                                           double       species_mass,
-                                           SmileiMPI   *smpi,
-                                           int          istart,
-                                           int          iend,
-                                           int          ithread,
-                                           int          icell,
-                                           int          ipart_ref )
+void Projector2D2OrderGPU::susceptibility( ElectroMagn */*EMfields*/,
+                                           Particles   &/*particles*/,
+                                           double       /*species_mass*/,
+                                           SmileiMPI   */*smpi*/,
+                                           int          /*istart*/,
+                                           int          /*iend*/,
+                                           int          /*ithread*/,
+                                           int          /*icell*/,
+                                           int          /*ipart_ref */)
 {
     ERROR( "Projector2D2OrderGPU::susceptibility(): Not implemented !" );
 }
 
-//#if defined( SMILEI_ACCELERATOR_MODE )
+//#if defined( SMILEI_ACCELERATOR_GPU )
 ////! Project global current densities (EMfields->Jx_/Jy_/Jz_)
 ////!
 //extern "C" void
diff --git a/src/Projector/Projector2D2OrderGPU.h b/src/Projector/Projector2D2OrderGPU.h
index 9a799f9b5..ecdd4959d 100755
--- a/src/Projector/Projector2D2OrderGPU.h
+++ b/src/Projector/Projector2D2OrderGPU.h
@@ -46,21 +46,21 @@ class Projector2D2OrderGPU : public Projector2D
                            int ipart_ref = 0 ) override;
 
     //!Wrapper for task-based implementation of Smilei
-    void currentsAndDensityWrapperOnBuffers( double *b_Jx,
-                                             double *b_Jy,
-                                             double *b_Jz,
-                                             double *b_rho,
-                                             int bin_width,
-                                             Particles &particles,
-                                             SmileiMPI *smpi,
-                                             int istart,
-                                             int iend,
-                                             int ithread,
-                                             bool diag_flag,
-                                             bool is_spectral,
-                                             int ispec,
-                                             int icell = 0,
-                                             int ipart_ref = 0 ) override {};
+    void currentsAndDensityWrapperOnBuffers( double * /*b_Jx*/,
+                                             double * /*b_Jy*/,
+                                             double * /*b_Jz*/,
+                                             double * /*b_rho*/,
+                                             int /*bin_width*/,
+                                             Particles &/*particles*/,
+                                             SmileiMPI */*smpi*/,
+                                             int /*istart*/,
+                                             int /*iend*/,
+                                             int /*ithread*/,
+                                             bool /*diag_flag*/,
+                                             bool /*is_spectral*/,
+                                             int /*ispec*/,
+                                             int /*icell*/ = 0,
+                                             int /*ipart_ref*/ = 0 ) override {};
 
     /// Project susceptibility, used as source term in envelope equation
     ///
diff --git a/src/Projector/Projector2D2OrderGPUKernel.cpp b/src/Projector/Projector2D2OrderGPUKernel.cpp
index 8f38f52fe..e2ec56495 100644
--- a/src/Projector/Projector2D2OrderGPUKernel.cpp
+++ b/src/Projector/Projector2D2OrderGPUKernel.cpp
@@ -1,4 +1,4 @@
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
 
 #include "Projector2D2OrderGPUKernelCUDAHIP.h"
 #include <cmath>
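The `extern "C"` declarations renamed above implement the bridge described in ParticlesFactory.cpp: a C signature carries no C++ name mangling, so the host translation units and the CUDA/HIP-compiled kernels can be built by different compilers without ABI mismatches. A minimal sketch of the pattern (the kernel name and arguments are illustrative, not from the patch):

```c++
// Host side: declare the entry point with C linkage.
extern "C" void depositOnDevice( double *grid, int size );

// Device-code translation unit (compiled by nvcc/hipcc): define it with the
// same C linkage, then launch the real kernel from inside.
extern "C" void depositOnDevice( double *grid, int size )
{
    // ... configure and launch the GPU kernel over grid[0:size] ...
}
```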
diff --git a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu
index 666a409f4..55082b793 100644
--- a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu
+++ b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu
@@ -81,7 +81,7 @@
 //                                device_particle_charge /* [0:particle_count] */, \
 //                                device_particle_weight /* [0:particle_count] */ )
 //     #pragma omp teams thread_limit( 64 ) distribute parallel for
-// #elif defined( SMILEI_OPENACC_MODE )
+// #elif defined( SMILEI_ACCELERATOR_GPU_OACC )
 //     #pragma acc parallel \
 //         deviceptr( device_particle_position_x, \
 //                    device_particle_position_y, \
@@ -264,7 +264,7 @@
 //                                device_particle_charge /* [0:particle_count] */, \
 //                                device_particle_weight /* [0:particle_count] */ )
 //     #pragma omp teams thread_limit( 64 ) distribute parallel for
-// #elif defined( SMILEI_OPENACC_MODE )
+// #elif defined( SMILEI_ACCELERATOR_GPU_OACC )
 //     #pragma acc parallel \
 //         deviceptr( device_particle_position_x, \
 //                    device_particle_position_y, \
diff --git a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h
index d607a4ab4..a21f757db 100644
--- a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h
+++ b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h
@@ -4,7 +4,7 @@
 #define Projector2D2OrderGPUKernelCUDAHIP_H
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
 
 #if defined( __HIP__ )
     #include <hip/hip_runtime.h>
diff --git a/src/Projector/Projector3D2OrderGPU.cpp b/src/Projector/Projector3D2OrderGPU.cpp
index 39342b204..62ec54141 100755
--- a/src/Projector/Projector3D2OrderGPU.cpp
+++ b/src/Projector/Projector3D2OrderGPU.cpp
@@ -30,13 +30,13 @@ Projector3D2OrderGPU::Projector3D2OrderGPU( Params &parameters, Patch *a_patch )
     dts2 = dt / 2.0;
     dts4 = dts2 / 2.0;
 
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_ACCELERATOR_GPU_OACC )
     // When sorting is disabled, these values are invalid (-1) and the HIP
     // implementation can't be used.
     x_dimension_bin_count_ = parameters.getGPUBinCount( 1 );
     y_dimension_bin_count_ = parameters.getGPUBinCount( 2 );
     z_dimension_bin_count_ = parameters.getGPUBinCount( 3 );
-//#elif defined( SMILEI_OPENACC_MODE )
+//#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
 //    x_dimension_bin_count_ = 1;
 //    y_dimension_bin_count_ = 1;
 //    z_dimension_bin_count_ = 1;
@@ -50,7 +50,7 @@ Projector3D2OrderGPU::~Projector3D2OrderGPU()
     // EMPTY
 }
 
-#if defined( SMILEI_ACCELERATOR_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU )
 extern "C" void
 currentDeposition3DOnDevice( double *__restrict__ Jx,
                              double *__restrict__ Jy,
@@ -122,6 +122,8 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
     /// Project global current densities (EMfields->Jx_/Jy_/Jz_)
     ///
     /* inline */ void
+
+#if defined( SMILEI_ACCELERATOR_GPU )
     currents( double *__restrict__ Jx,
               double *__restrict__ Jy,
               double *__restrict__ Jz,
@@ -150,72 +152,77 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
               double,
               int not_spectral )
     {
-#if defined( SMILEI_ACCELERATOR_MODE )
         currentDeposition3DOnDevice( Jx,
-                                         Jy,
-                                         Jz,
-                                         Jx_size,
-                                         Jy_size,
-                                         Jz_size,
-                                         particles.getPtrPosition( 0 ),
-                                         particles.getPtrPosition( 1 ),
-                                         particles.getPtrPosition( 2 ),
-                                         particles.getPtrCharge(),
-                                         particles.getPtrWeight(),
-                                         particles.last_index.data(),
-                                         x_dimension_bin_count,
-                                         y_dimension_bin_count,
-                                         z_dimension_bin_count,
-                                         invgf_,
-                                         iold_,
-                                         deltaold_,
-                                         particles.deviceSize(),
-                                         inv_cell_volume,
-                                         dx_inv,
-                                         dy_inv,
-                                         dz_inv,
-                                         dx_ov_dt,
-                                         dy_ov_dt,
-                                         dz_ov_dt,
-                                         i_domain_begin,
-                                         j_domain_begin,
-                                         k_domain_begin,
-                                         nprimy, nprimz,
-                                         not_spectral );
+                                     Jy,
+                                     Jz,
+                                     Jx_size,
+                                     Jy_size,
+                                     Jz_size,
+                                     particles.getPtrPosition( 0 ),
+                                     particles.getPtrPosition( 1 ),
+                                     particles.getPtrPosition( 2 ),
+                                     particles.getPtrCharge(),
+                                     particles.getPtrWeight(),
+                                     particles.last_index.data(),
+                                     x_dimension_bin_count,
+                                     y_dimension_bin_count,
+                                     z_dimension_bin_count,
+                                     invgf_,
+                                     iold_,
+                                     deltaold_,
+                                     particles.deviceSize(),
+                                     inv_cell_volume,
+                                     dx_inv,
+                                     dy_inv,
+                                     dz_inv,
+                                     dx_ov_dt,
+                                     dy_ov_dt,
+                                     dz_ov_dt,
+                                     i_domain_begin,
+                                     j_domain_begin,
+                                     k_domain_begin,
+                                     nprimy, nprimz,
+                                     not_spectral );
+    }
 #else
+    currents( double *__restrict__ , double *__restrict__ , double *__restrict__ , int, int, int,
+              Particles &, unsigned int , unsigned int , unsigned int , const double *__restrict__ ,
+              const int *__restrict__ , const double *__restrict__ , double , double , double , double ,
+              double , double , double , int , int , int , int , int , double, int )
+    {
         SMILEI_ASSERT( false );
-#endif
     }
+#endif
 
     //! Project density
     /* inline */ void
+#if defined( SMILEI_ACCELERATOR_GPU )
     density(
-            double *__restrict__ rho,
-            int rho_size,
-            Particles &particles,
-            unsigned int x_dimension_bin_count,
-            unsigned int y_dimension_bin_count,
-            unsigned int z_dimension_bin_count,
-            const double *__restrict__ invgf_,
-            const int *__restrict__ iold_,
-            const double *__restrict__ deltaold_,
-            double inv_cell_volume,
-            double dx_inv,
-            double dy_inv,
-            double dz_inv,
-            double dx_ov_dt,
-            double dy_ov_dt,
-            double dz_ov_dt,
-            int i_domain_begin,
-            int j_domain_begin,
-            int k_domain_begin,
-            int nprimy,
-            int nprimz,
-            double,
-            int not_spectral )
+        double *__restrict__ rho,
+        int rho_size,
+        Particles &particles,
+        unsigned int x_dimension_bin_count,
+        unsigned int y_dimension_bin_count,
+        unsigned int z_dimension_bin_count,
+        const double *__restrict__ invgf_,
+        const int *__restrict__ iold_,
+        const double *__restrict__ deltaold_,
+        double inv_cell_volume,
+        double dx_inv,
+        double dy_inv,
+        double dz_inv,
+        double dx_ov_dt,
+        double dy_ov_dt,
+        double dz_ov_dt,
+        int i_domain_begin,
+        int j_domain_begin,
+        int k_domain_begin,
+        int nprimy,
+        int nprimz,
+        double,
+        int not_spectral )
     {
-#if defined( SMILEI_ACCELERATOR_MODE )
         densityDeposition3DOnDevice( rho,
                                      rho_size,
@@ -244,10 +251,16 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy
                                      k_domain_begin,
                                      nprimy, nprimz,
                                      not_spectral );
+    }
 #else
+    density( double *__restrict__ , int , Particles &, unsigned int , unsigned int , unsigned int ,
+             const double *__restrict__ , const int *__restrict__ , const double *__restrict__ ,
+             double , double , double , double , double , double , double ,
+             int, int, int, int, int, double, int )
+    {
         SMILEI_ASSERT( false );
-#endif
     }
+#endif
 
 } // namespace
 
@@ -255,7 +268,7 @@ void Projector3D2OrderGPU::basic( double      *rhoj,
                                   Particles   &particles,
                                   unsigned int ipart,
                                   unsigned int type,
-                                  int          bin_shift )
+                                  int          /*bin_shift*/ )
 {
 
@@ -347,12 +360,12 @@ void Projector3D2OrderGPU::basic( double      *rhoj,
     }
 }
 
-void Projector3D2OrderGPU::ionizationCurrents( Field      *Jx,
-                                               Field      *Jy,
-                                               Field      *Jz,
-                                               Particles  &particles,
-                                               int         ipart,
-                                               LocalFields Jion )
+void Projector3D2OrderGPU::ionizationCurrents( Field      */*Jx*/,
+                                               Field      */*Jy*/,
+                                               Field      */*Jz*/,
+                                               Particles  &/*particles*/,
+                                               int         /*ipart*/,
+                                               LocalFields /*Jion */)
 {
     ERROR( "Projector3D2OrderGPU::ionizationCurrents(): Not implemented !" );
 }
@@ -366,8 +379,8 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields,
                                                       bool         diag_flag,
                                                       bool         is_spectral,
                                                       int          ispec,
-                                                      int          icell,
-                                                      int          ipart_ref )
+                                                      int          /*icell*/,
+                                                      int          /*ipart_ref*/ )
 {
 
     if( is_spectral ) {
@@ -463,15 +476,15 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields,
     //std::cerr << sum << " " << sum2 << " " << sum_Jxs << " " << sum_Jx << std::endl;
 }
 
-void Projector3D2OrderGPU::susceptibility( ElectroMagn *EMfields,
-                                           Particles   &particles,
-                                           double       species_mass,
-                                           SmileiMPI   *smpi,
-                                           int          istart,
-                                           int          iend,
-                                           int          ithread,
-                                           int          icell,
-                                           int          ipart_ref )
+void Projector3D2OrderGPU::susceptibility( ElectroMagn */*EMfields*/,
+                                           Particles   &/*particles*/,
+                                           double       /*species_mass*/,
+                                           SmileiMPI   */*smpi*/,
+                                           int          /*istart*/,
+                                           int          /*iend*/,
+                                           int          /*ithread*/,
+                                           int          /*icell*/,
+                                           int          /*ipart_ref */)
 {
     ERROR( "Projector3D2OrderGPU::susceptibility(): Not implemented !" );
); } diff --git a/src/Projector/Projector3D2OrderGPU.cpp.backup b/src/Projector/Projector3D2OrderGPU.cpp.backup index 39ce7a4a5..761e6ae31 100755 --- a/src/Projector/Projector3D2OrderGPU.cpp.backup +++ b/src/Projector/Projector3D2OrderGPU.cpp.backup @@ -2,7 +2,7 @@ #include #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #include #endif @@ -136,7 +136,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Sx0 [0:kTmpArraySize], \ @@ -262,7 +262,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSx [0:kTmpArraySize], sumX [0:kTmpArraySize] ) // #pragma acc parallel deviceptr( DSx, sumX ) @@ -287,7 +287,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jx [0:Jx_size], \ Sy0 [0:kTmpArraySize], \ @@ -310,7 +310,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( const double crx_p = dx_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex0 = iold[ipart+0*packsize]*yz_size0+iold[ipart+1*packsize]*z_size0+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -326,7 +326,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif Jx [ jdx ] += val; @@ -339,7 +339,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSy [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -365,7 +365,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jy [0:Jy_size], \ Sx0 [0:kTmpArraySize], \ @@ -388,7 +388,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( const double cry_p = dy_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex1 = iold[ipart+0*packsize]*yz_size1+iold[ipart+1*packsize]*z_size1+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -404,7 +404,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif Jy [ jdx ] += val; 
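// Editorial sketch (not Smilei code): every deposition hunk in this file renames
// the same guard around an atomic scatter-add. Only the pragma changes with the
// backend; the deposition statement stays identical. The names depositOne, grid,
// idx and val below are hypothetical:
static inline void depositOne( double *grid, int idx, double val )
{
#if defined( SMILEI_ACCELERATOR_GPU_OMP )
    #pragma omp atomic update
#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
    #pragma acc atomic
#endif
    grid[ idx ] += val; // atomic: several particles may target the same cell concurrently
}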
@@ -417,7 +417,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSz [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -443,7 +443,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jz [0:Jz_size], \ Sx0 [0:kTmpArraySize], \ @@ -466,7 +466,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( const double crz_p = dz_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex2 = iold[ipart+0*packsize]*yz_size2+iold[ipart+1*packsize]*z_size2+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=1 ; k<5 ; k++ ) { @@ -482,7 +482,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif Jz[ jdx ] += val; @@ -498,7 +498,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ rho [0:rho_size], \ Sx1 [0:kTmpArraySize], \ @@ -523,7 +523,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( int jdx = idx + k; #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif rho[ jdx ] += charge_weight * Sx1[ipart_pack+i*packsize]*Sy1[ipart_pack+j*packsize]*Sz1[ipart_pack+k*packsize]; diff --git a/src/Projector/Projector3D2OrderGPU.h b/src/Projector/Projector3D2OrderGPU.h index 2fac2402e..c76bf48a1 100755 --- a/src/Projector/Projector3D2OrderGPU.h +++ b/src/Projector/Projector3D2OrderGPU.h @@ -46,21 +46,21 @@ class Projector3D2OrderGPU : public Projector3D int ipart_ref = 0 ) override; //!Wrapper for task-based implementation of Smilei - void currentsAndDensityWrapperOnBuffers( double *b_Jx, - double *b_Jy, - double *b_Jz, - double *b_rho, - int bin_width, - Particles &particles, - SmileiMPI *smpi, - int istart, - int iend, - int ithread, - bool diag_flag, - bool is_spectral, - int ispec, - int icell = 0, - int ipart_ref = 0 ) override {}; + void currentsAndDensityWrapperOnBuffers( double * /*b_Jx*/, + double * /*b_Jy*/, + double * /*b_Jz*/, + double * /*b_rho*/, + int /*bin_width*/, + Particles &/*particles*/, + SmileiMPI */*smpi*/, + int /*istart*/, + int /*iend*/, + int /*ithread*/, + bool /*diag_flag*/, + bool /*is_spectral*/, + int /*ispec*/, + int /*icell*/ = 0, + int /*ipart_ref*/ = 0 ) override {}; /// Project susceptibility, used as source term in envelope equation /// diff --git a/src/Projector/Projector3D2OrderGPUKernel.cpp b/src/Projector/Projector3D2OrderGPUKernel.cpp index f77a4fda3..5d9f88b5d 100644 --- a/src/Projector/Projector3D2OrderGPUKernel.cpp +++ b/src/Projector/Projector3D2OrderGPUKernel.cpp @@ -5,7 +5,7 @@ // issues (!). 
-#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //! Simple switch to jump between the reference (omp) implementation and the //! hip one. diff --git a/src/Projector/Projector3D2OrderGPUKernelAcc.h b/src/Projector/Projector3D2OrderGPUKernelAcc.h index 9cf3b224d..43bff1cce 100644 --- a/src/Projector/Projector3D2OrderGPUKernelAcc.h +++ b/src/Projector/Projector3D2OrderGPUKernelAcc.h @@ -1,6 +1,6 @@ //! Optimized Acc projection (from Julien Derouillat) -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #include #include "Tools.h" @@ -110,7 +110,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Sx0 [0:kTmpArraySize], \ @@ -236,7 +236,7 @@ namespace acc { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSx [0:kTmpArraySize], sumX [0:kTmpArraySize] ) // #pragma acc parallel deviceptr( DSx, sumX ) @@ -261,7 +261,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jx [0:Jx_size], \ Sy0 [0:kTmpArraySize], \ @@ -284,7 +284,7 @@ namespace acc { const double crx_p = dx_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex0 = iold[ipart+0*packsize]*yz_size0+iold[ipart+1*packsize]*z_size0+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -309,7 +309,7 @@ namespace acc { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSy [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -335,7 +335,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jy [0:Jy_size], \ Sx0 [0:kTmpArraySize], \ @@ -358,7 +358,7 @@ namespace acc { const double cry_p = dy_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex1 = iold[ipart+0*packsize]*yz_size1+iold[ipart+1*packsize]*z_size1+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -383,7 +383,7 @@ namespace acc { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSz [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -409,7 +409,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( 
SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jz [0:Jz_size], \ Sx0 [0:kTmpArraySize], \ @@ -432,7 +432,7 @@ namespace acc { const double crz_p = dz_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex2 = iold[ipart+0*packsize]*yz_size2+iold[ipart+1*packsize]*z_size2+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=1 ; k<5 ; k++ ) { @@ -536,7 +536,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Sx1 [0:kTmpArraySize], \ @@ -630,7 +630,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ rho [0:rho_size], \ Sx1 [0:kTmpArraySize], \ diff --git a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu index 195a02667..dd8d1e61d 100644 --- a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu +++ b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu @@ -1,6 +1,6 @@ //! HIP CUDA implementation -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //#include "Projector3D2OrderGPUKernelCUDAHIP.h" diff --git a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h index 94368f4dd..1b78b1252 100644 --- a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h +++ b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h @@ -4,7 +4,7 @@ #define Projector3D2OrderGPUKernelCUDAHIP_H -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #if defined( __HIP__ ) #include diff --git a/src/Projector/Projector3D2OrderGPUKernelNaive.h b/src/Projector/Projector3D2OrderGPUKernelNaive.h index b6cfac080..a261af40b 100644 --- a/src/Projector/Projector3D2OrderGPUKernelNaive.h +++ b/src/Projector/Projector3D2OrderGPUKernelNaive.h @@ -1,6 +1,6 @@ //! 
Naive ACC/OMP implementation -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #include #include "Tools.h" @@ -66,7 +66,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Jx[0:Jx_size], \ @@ -344,7 +344,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ rho[0:rho_size] \ diff --git a/src/Projector/ProjectorAM2OrderV.cpp b/src/Projector/ProjectorAM2OrderV.cpp index b222aa4ee..890d37332 100755 --- a/src/Projector/ProjectorAM2OrderV.cpp +++ b/src/Projector/ProjectorAM2OrderV.cpp @@ -673,10 +673,6 @@ void ProjectorAM2OrderV::susceptibility( ElectroMagn *EMfields, Particles &parti double charge_weight[8] __attribute__( ( aligned( 64 ) ) ); // double r_bar[8] __attribute__( ( aligned( 64 ) ) ); - //double *invR_local = &(invR_[jpom2]); - // double *invRd_local = &(invRd_[jpom2]); - - double *invR_local = &(invR_[jpom2]); // Pointer for GPU and vectorization on ARM processors double * __restrict__ position_x = particles.getPtrPosition(0); double * __restrict__ position_y = particles.getPtrPosition(1); diff --git a/src/Projector/ProjectorFactory.h b/src/Projector/ProjectorFactory.h index db8c39e1f..278739301 100755 --- a/src/Projector/ProjectorFactory.h +++ b/src/Projector/ProjectorFactory.h @@ -42,7 +42,7 @@ class ProjectorFactory // --------------- else if( ( params.geometry == "2Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { if( !vectorization ) { - #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) + #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) Proj = new Projector2D2OrderGPU( params, patch ); #else Proj = new Projector2D2Order( params, patch ); @@ -64,7 +64,7 @@ class ProjectorFactory // --------------- else if( ( params.geometry == "3Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { if( !vectorization ) { - #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) + #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) Proj = new Projector3D2OrderGPU( params, patch ); #else Proj = new Projector3D2Order( params, patch ); diff --git a/src/Pusher/PusherBoris.cpp b/src/Pusher/PusherBoris.cpp index 536def7a9..8f70a6cc3 100755 --- a/src/Pusher/PusherBoris.cpp +++ b/src/Pusher/PusherBoris.cpp @@ -57,7 +57,7 @@ void PusherBoris::operator()( Particles &particles, SmileiMPI *smpi, int istart, position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherBorisNR.cpp b/src/Pusher/PusherBorisNR.cpp index 84f072e1f..df4a3277b 100755 --- a/src/Pusher/PusherBorisNR.cpp +++ b/src/Pusher/PusherBorisNR.cpp @@ -57,7 +57,7 @@ void PusherBorisNR::operator()( Particles &particles, SmileiMPI *smpi, int istar 
position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherHigueraCary.cpp b/src/Pusher/PusherHigueraCary.cpp index 2ab234ae1..c85189fff 100755 --- a/src/Pusher/PusherHigueraCary.cpp +++ b/src/Pusher/PusherHigueraCary.cpp @@ -68,7 +68,7 @@ void PusherHigueraCary::operator()( Particles &particles, SmileiMPI *smpi, int i position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherPhoton.cpp b/src/Pusher/PusherPhoton.cpp index a94a521e3..5feb7823d 100755 --- a/src/Pusher/PusherPhoton.cpp +++ b/src/Pusher/PusherPhoton.cpp @@ -53,7 +53,7 @@ void PusherPhoton::operator()( Particles &particles, SmileiMPI *smpi, position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_ref; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherPonderomotiveBoris.cpp b/src/Pusher/PusherPonderomotiveBoris.cpp index 41afa42e6..9d151dabb 100755 --- a/src/Pusher/PusherPonderomotiveBoris.cpp +++ b/src/Pusher/PusherPonderomotiveBoris.cpp @@ -55,7 +55,7 @@ void PusherPonderomotiveBoris::operator()( Particles &particles, SmileiMPI *smpi const double *const __restrict__ GradPhiz = &( ( *GradPhipart )[2*nparts] ); //double *inv_gamma_ponderomotive = &( ( *dynamics_inv_gamma_ponderomotive )[0*nparts] ); - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else int np = iend-istart; diff --git a/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp b/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp index 379f41763..a32f359cb 100644 --- a/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp +++ b/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp @@ -31,7 +31,6 @@ void PusherPonderomotiveBorisBTIS3::operator()( Particles &particles, SmileiMPI double charge_over_mass_dts2, charge_sq_over_mass_sq_dts4; double umx, umy, umz, upx, upy, upz; double alpha; - double TxTy, TyTz, TzTx; double pxsm, pysm, pzsm; //double one_ov_gamma_ponderomotive; diff --git a/src/Pusher/PusherPonderomotivePositionBoris.cpp b/src/Pusher/PusherPonderomotivePositionBoris.cpp index 16a4e6c69..9b9bea639 100755 --- a/src/Pusher/PusherPonderomotivePositionBoris.cpp +++ b/src/Pusher/PusherPonderomotivePositionBoris.cpp @@ -52,7 +52,7 @@ void PusherPonderomotivePositionBoris::operator()( Particles &particles, SmileiM const double *const __restrict__ GradPhi_my = &( ( *GradPhi_mpart )[1*nparts] ); const double *const __restrict__ GradPhi_mz = &( ( *GradPhi_mpart )[2*nparts] ); - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else int np = iend-istart; diff --git a/src/Pusher/PusherVay.cpp b/src/Pusher/PusherVay.cpp index c1ba76693..83debaae4 100755 --- a/src/Pusher/PusherVay.cpp +++ b/src/Pusher/PusherVay.cpp @@ -67,7 +67,7 @@ void PusherVay::operator()( Particles &particles, SmileiMPI *smpi, int istart, i position_y /* [istart:particle_number] */, \ position_z 
/* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Radiation/RadiationCorrLandauLifshitz.cpp b/src/Radiation/RadiationCorrLandauLifshitz.cpp index 16c7b01fe..ebb0e54dd 100755 --- a/src/Radiation/RadiationCorrLandauLifshitz.cpp +++ b/src/Radiation/RadiationCorrLandauLifshitz.cpp @@ -96,7 +96,7 @@ void RadiationCorrLandauLifshitz::operator()( // cumulative Radiated energy from istart to iend double radiated_energy_loc = 0; -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Local vector to store the radiated energy double * rad_norm_energy = new double [iend-istart]; // double * rad_norm_energy = (double*) aligned_alloc(64, (iend-istart)*sizeof(double)); @@ -112,7 +112,7 @@ void RadiationCorrLandauLifshitz::operator()( // Computation // NVIDIA GPUs - #if defined (SMILEI_OPENACC_MODE) + #if defined (SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_ref; const int np = iend-istart; #pragma acc parallel \ @@ -185,7 +185,7 @@ void RadiationCorrLandauLifshitz::operator()( // _______________________________________________________________ // Computation of the thread radiated energy -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Exact energy loss due to the radiation rad_norm_energy[ipart-istart] = gamma - std::sqrt( 1.0 @@ -210,7 +210,7 @@ void RadiationCorrLandauLifshitz::operator()( // _______________________________________________________________ // Update of the quantum parameter -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd for( int ipart=istart ; ipart #include -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) #define __HIP_PLATFORM_NVCC__ #define __HIP_PLATFORM_NVIDIA__ #include "gpuRandom.h" @@ -103,7 +103,7 @@ void RadiationMonteCarlo::operator()( // Temporary double parameter double temp; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC unsigned long long seed; // Parameters for CUDA generator unsigned long long seq; unsigned long long offset; @@ -152,7 +152,7 @@ void RadiationMonteCarlo::operator()( // Number of photons int nphotons; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC int nphotons_start; #endif @@ -160,7 +160,7 @@ void RadiationMonteCarlo::operator()( const double photon_buffer_size_per_particle = radiation_photon_sampling_ * max_photon_emissions_; if (photons) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We reserve a large number of potential photons on device since we can't reallocate nphotons_start = photons->deviceSize(); //static_cast(photons)->deviceReserve( nphotons + (iend - istart) * photon_buffer_size_per_particle ); @@ -199,13 +199,13 @@ void RadiationMonteCarlo::operator()( double *const __restrict__ photon_tau = photons ? (photons->has_Monte_Carlo_process ? photons->getPtrTau() : nullptr) : nullptr; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // Cell keys as a mask int *const __restrict__ photon_cell_keys = photons ? 
photons->getPtrCellKeys() : nullptr; #endif // Table properties ---------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // Size of tables // int size_of_Table_integfochi = RadiationTables.integfochi_.size_particle_chi_; // int size_of_Table_min_photon_chi = RadiationTables.xi_.size_particle_chi_; @@ -221,7 +221,7 @@ void RadiationMonteCarlo::operator()( // _______________________________________________________________ // Computation -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // Management of the data on GPU though this data region int np = iend-istart; @@ -342,7 +342,7 @@ void RadiationMonteCarlo::operator()( // New final optical depth to reach for emision while( tau[ipart] <= epsilon_tau_ ) { //tau[ipart] = -log( 1.-Rand::uniform() ); - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC tau[ipart] = -std::log( 1.-rand_->uniform() ); #else seed_curand_1 = (int) (ipart+1)*(initial_seed_1+1); //Seed for linear generator @@ -385,7 +385,7 @@ void RadiationMonteCarlo::operator()( // Draw random number in [0,1[ - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC random_number = rand_->uniform(); #else seed_curand_2 = (int) (ipart + 1)*(initial_seed_2 + 1); //Seed for linear generator @@ -433,7 +433,7 @@ void RadiationMonteCarlo::operator()( && ( i_photon_emission < max_photon_emissions_)) { // CPU implementation (non-threaded implementation) -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Creation of new photons in the temporary array photons photons->createParticles( radiation_photon_sampling_ ); @@ -611,14 +611,14 @@ void RadiationMonteCarlo::operator()( } // end while } // end for -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end acc parallel #endif //if (photons) std::cerr << photons->deviceSize() << std::endl; // Remove extra space to save memory -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC if (photons) { photons->shrinkToFit( true ); } @@ -631,7 +631,7 @@ void RadiationMonteCarlo::operator()( // ____________________________________________________ // Update of the quantum parameter chi -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else int np = iend-istart; @@ -660,11 +660,11 @@ void RadiationMonteCarlo::operator()( } - #ifdef SMILEI_OPENACC_MODE + #ifdef SMILEI_ACCELERATOR_GPU_OACC } // end acc parallel #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end acc data #endif diff --git a/src/Radiation/RadiationMonteCarlo.h b/src/Radiation/RadiationMonteCarlo.h index 34b8c31db..4e84f169d 100755 --- a/src/Radiation/RadiationMonteCarlo.h +++ b/src/Radiation/RadiationMonteCarlo.h @@ -16,7 +16,7 @@ #include "Radiation.h" #include "userFunctions.h" -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include // This is wrong. Dont include nvidiaParticles, it may cause problem! // See particle factory. 
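The RadiationMonteCarlo hunks above keep one sampling rule identical on every backend: a particle's next emission is scheduled by drawing an optical depth from an exponential law, tau = -log(1 - u) with u uniform in [0,1), redrawing while tau stays at or below epsilon_tau_. A minimal self-contained sketch of that step, with a hypothetical uniform01() standing in for rand_->uniform() (CPU) or curand_uniform() (GPU):

#include <cmath>
#include <random>

// Hypothetical stand-in for Smilei's per-backend generators; fixed seed, sketch only.
static double uniform01()
{
    static std::mt19937_64 gen( 0xDEADBEEF );
    static std::uniform_real_distribution<double> u( 0.0, 1.0 );
    return u( gen );
}

// Draw the optical depth to accumulate before the next emission event,
// mirroring the while( tau[ipart] <= epsilon_tau_ ) loop above.
double drawEmissionOpticalDepth( double epsilon_tau )
{
    double tau = 0.0;
    while( tau <= epsilon_tau ) {
        tau = -std::log( 1.0 - uniform01() ); // exponential law, unit mean
    }
    return tau;
}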
diff --git a/src/Radiation/RadiationNiel.cpp b/src/Radiation/RadiationNiel.cpp index 6e61f3759..dff292df4 100755 --- a/src/Radiation/RadiationNiel.cpp +++ b/src/Radiation/RadiationNiel.cpp @@ -127,7 +127,7 @@ void RadiationNiel::operator()( double radiated_energy_loc = 0; // Parameters for linear alleatory number generator - #ifdef SMILEI_OPENACC_MODE + #ifdef SMILEI_ACCELERATOR_GPU_OACC // Initialize initial seed for linear generator double initial_seed = rand_->uniform(); @@ -144,7 +144,7 @@ void RadiationNiel::operator()( //double t0 = MPI_Wtime(); // 1) Vectorized computation of gamma and the particle quantum parameter -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else @@ -190,12 +190,12 @@ void RadiationNiel::operator()( Ex[ipart-ipart_ref], Ey[ipart-ipart_ref], Ez[ipart-ipart_ref], Bx[ipart-ipart_ref], By[ipart-ipart_ref], Bz[ipart-ipart_ref] ); -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC } //finish cycle #endif //double t1 = MPI_Wtime(); - #ifdef SMILEI_OPENACC_MODE + #ifdef SMILEI_ACCELERATOR_GPU_OACC if( particle_chi[ipart] > minimum_chi_continuous ) { seed_curand = (int) (ipart+1)*(initial_seed+1); //Seed for linear generator @@ -297,7 +297,7 @@ void RadiationNiel::operator()( if( niel_computation_method == 0 ) { - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC for( ipart=istart ; ipart minimum_chi_continuous ) { @@ -310,7 +310,7 @@ void RadiationNiel::operator()( diffusion[ipart-istart] = std::sqrt( factor_classical_radiated_power*gamma[ipart-ipart_ref]*temp )*random_numbers[ipart-istart]; - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC } } #endif @@ -318,7 +318,7 @@ void RadiationNiel::operator()( // Using the fit at order 5 (vectorized) else if( niel_computation_method == 1 ) { - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd private(temp) for( ipart=istart ; ipart #endif diff --git a/src/Radiation/RadiationTables.h b/src/Radiation/RadiationTables.h index bc5003966..77bcac8e2 100755 --- a/src/Radiation/RadiationTables.h +++ b/src/Radiation/RadiationTables.h @@ -58,7 +58,7 @@ class RadiationTables //! param[in] particle_chi particle quantum parameter //! param[in] particle_gamma particle Lorentz factor //! param[in] integfochi_table table of the discretized integrated f/chi function for Photon production yield computation -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double computePhotonProductionYield( const double particle_chi, @@ -77,7 +77,7 @@ class RadiationTables //! \param[in] xi //! \param[in] table_min_photon_chi //! \param[in] table_xi -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double computeRandomPhotonChiWithInterpolation( double particle_chi, @@ -95,7 +95,7 @@ class RadiationTables //! from the computed table niel_.table //! \param particle_chi particle quantum parameter -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double getHNielFromTable( double particle_chi, double * tableNiel); @@ -116,7 +116,7 @@ class RadiationTables //! \param particle_chi particle quantum parameter //! \param dt time step //#pragma omp declare simd -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getRidgersCorrectedRadiatedEnergy( const double particle_chi, @@ -138,7 +138,7 @@ class RadiationTables //! 
Get of the classical continuous radiated energy during dt //! \param particle_chi particle quantum parameter //! \param dt time step -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getClassicalRadiatedEnergy( double particle_chi, double dt ) @@ -148,7 +148,7 @@ class RadiationTables //! Return the minimum_chi_discontinuous_ value //! Under this value, no discontinuous radiation reaction -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getMinimumChiDiscontinuous() @@ -158,7 +158,7 @@ class RadiationTables //! Return the minimum_chi_continuous_ value //! Under this value, no continuous radiation reaction -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getMinimumChiContinuous() diff --git a/src/Radiation/RadiationTools.h b/src/Radiation/RadiationTools.h index 33cb5f501..1746c894e 100644 --- a/src/Radiation/RadiationTools.h +++ b/src/Radiation/RadiationTools.h @@ -32,7 +32,7 @@ class RadiationTools { //! Valid between particle_chi in 1E-3 and 1E1 //! \param particle_chi particle quantum parameter // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) getHNielFitOrder10(double particle_chi) @@ -62,7 +62,7 @@ class RadiationTools { //! Valid between particle_chi in 1E-3 and 1E1 //! \param particle_chi particle quantum parameter // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) getHNielFitOrder5(double particle_chi) @@ -86,7 +86,7 @@ class RadiationTools { //! Ridgers et al., ArXiv 1708.04511 (2017) //! \param particle_chi particle quantum parameter // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) getHNielFitRidgers(double particle_chi) @@ -104,7 +104,7 @@ class RadiationTools { //! approximation formulae //! \param particle_chi particle quantum parameter //#pragma omp declare simd -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeGRidgers(double particle_chi) @@ -117,7 +117,7 @@ class RadiationTools { //! Return f1(nu) = Int_nu^\infty K_{5/3}(y) dy //! used in computed synchrotron power spectrum // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeF1Nu(double nu) @@ -155,7 +155,7 @@ class RadiationTools { //! Return f2(nu) = BesselK_{2/3}(nu) //! used in computed synchrotron power spectrum // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeF2Nu(double nu) @@ -194,7 +194,7 @@ class RadiationTools { //! 
= Int_nu^\infty K_{5/3}(y) dy + cst * BesselK_{2/3}(nu) //! used in computed synchrotron power spectrum // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeBesselPartsRadiatedPower(double nu, double cst) diff --git a/src/Radiation/Table.h b/src/Radiation/Table.h index 8b74aeeaa..a028d4df3 100644 --- a/src/Radiation/Table.h +++ b/src/Radiation/Table.h @@ -45,7 +45,7 @@ class Table void compute_parameters(); //! get value using linear interpolation at position x -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double get(double x); diff --git a/src/Smilei.cpp b/src/Smilei.cpp index eae1993d9..81ba6c258 100755 --- a/src/Smilei.cpp +++ b/src/Smilei.cpp @@ -20,7 +20,7 @@ #include #include #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif @@ -44,7 +44,7 @@ using namespace std; // MAIN CODE // --------------------------------------------------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #ifdef _OPENACC void initialization_openacc() { @@ -80,7 +80,7 @@ int main( int argc, char *argv[] ) // ------------------------- // Create the OpenACC environment -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC initialization_openacc(); #endif @@ -248,7 +248,7 @@ int main( int argc, char *argv[] ) checkpoint.restartAll( vecPatches, region, &smpi, params ); -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) // CPU only, its too early to sort on GPU vecPatches.initialParticleSorting( params ); #endif @@ -271,7 +271,7 @@ int main( int argc, char *argv[] ) PatchesFactory::createVector( vecPatches, params, &smpi, openPMD, &radiation_tables_, 0 ); -#if !(defined( SMILEI_ACCELERATOR_MODE )) +#if !(defined( SMILEI_ACCELERATOR_GPU )) // CPU only, its too early to sort on GPU vecPatches.initialParticleSorting( params ); #endif @@ -407,7 +407,7 @@ int main( int argc, char *argv[] ) } } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) TITLE( "GPU allocation and copy of the fields and particles" ); // Allocate particle and field arrays // Also copy particle array content on device @@ -685,7 +685,7 @@ int main( int argc, char *argv[] ) } //End omp parallel region if( params.has_load_balancing && params.load_balancing_time_selection->theTimeIsNow( itime ) ) { -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // ERROR( "Load balancing not tested on GPU !" 
); // #endif count_dlb++; @@ -777,7 +777,7 @@ int main( int argc, char *argv[] ) region.clean(); } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) vecPatches.cleanDataOnDevice( params, &smpi, &radiation_tables_, &multiphoton_Breit_Wheeler_tables_ ); #endif diff --git a/src/SmileiMPI/SmileiMPI.cpp b/src/SmileiMPI/SmileiMPI.cpp index 4fe93fd03..88e03c864 100755 --- a/src/SmileiMPI/SmileiMPI.cpp +++ b/src/SmileiMPI/SmileiMPI.cpp @@ -763,7 +763,7 @@ void SmileiMPI::isend_species( Patch *patch, int to, int &irequest, int tag, Par irequest ++; } -#if defined( SMILEI_ACCELERATOR_MODE) +#if defined( SMILEI_ACCELERATOR_GPU) // For the particles for( unsigned int ispec=0; ispec &requests, int tag, bool send_xmax_bc ) { -// #if defined (SMILEI_ACCELERATOR_MODE) +// #if defined (SMILEI_ACCELERATOR_GPU) // isendOnDevice( EM->Ex_, to, tag+irequest, requests[irequest] ); // irequest++; @@ -1745,7 +1745,7 @@ int SmileiMPI::recv_PML(ElectroMagn *EM, Tpml embc, int bcId, int from, int tag void SmileiMPI::recv( ElectroMagn *EM, int from, int &tag, bool recv_xmin_bc ) { -// #if defined (SMILEI_ACCELERATOR_MODE) +// #if defined (SMILEI_ACCELERATOR_GPU) // recvOnDevice( EM->Ex_, from, tag ); // tag++; @@ -2121,7 +2121,7 @@ void SmileiMPI::isend( Field *field, int to, int tag, MPI_Request &request ) } // End isend ( Field ) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) //! Sends the whole Field Device to Device (assuming MPI enables it) void SmileiMPI::isendOnDevice( Field *field, int to, int tag, MPI_Request &request ) { @@ -2194,7 +2194,7 @@ void SmileiMPI::recv( Field *field, int from, int tag ) } // End recv ( Field ) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) void SmileiMPI::recvOnDevice( Field *field, int from, int tag ) { @@ -2524,7 +2524,7 @@ void SmileiMPI::eraseBufferParticleTrail( const int ndim, const int istart, cons } -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) template static inline void diff --git a/src/SmileiMPI/SmileiMPI.h b/src/SmileiMPI/SmileiMPI.h index 13cacc416..2785921de 100755 --- a/src/SmileiMPI/SmileiMPI.h +++ b/src/SmileiMPI/SmileiMPI.h @@ -103,7 +103,7 @@ class SmileiMPI //! Sends the whole Field void isend( Field *field, int to, int tag, MPI_Request &request ); //! Sends the whole Field Device to Device (assuming MPI enables it) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) void isendOnDevice( Field *field, int to, int tag, MPI_Request &request ); #endif @@ -114,7 +114,7 @@ class SmileiMPI //! Receives the whole Field void recv( Field *field, int from, int tag); //! Receives the whole Field Device to Device (assuming MPI enables it) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) void recvOnDevice( Field *field, int from, int tag); #endif @@ -248,7 +248,7 @@ class SmileiMPI //! Erase Particles from istart ot the end in the buffers of thread ithread void eraseBufferParticleTrail( const int ndim, const int istart, const int ithread, bool isAM = false ); -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) //! Map CPU buffers onto the GPU to at least accommodate particle_count //! particles. This method tries to reduce the number of //! 
allocation/deallocation which produces a lot of fragmentation on some diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp index 65358f555..089e25f27 100755 --- a/src/Species/Species.cpp +++ b/src/Species/Species.cpp @@ -500,7 +500,7 @@ Species::~Species() } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //! Prepare the species Current and Rho grids on Device void Species::prepareSpeciesCurrentAndChargeOnDevice( @@ -540,7 +540,7 @@ Species::prepareSpeciesCurrentAndChargeOnDevice( } -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Jx_s[0:Jx_size], \ Jy_s[0:Jy_size], \ Jz_s[0:Jz_size], \ @@ -551,7 +551,7 @@ Species::prepareSpeciesCurrentAndChargeOnDevice( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc loop gang worker vector #endif for( unsigned int i=0 ; icopyFromHostToDevice(); } -#endif // end if SMILEI_ACCELERATOR_MODE +#endif // end if SMILEI_ACCELERATOR_GPU // --------------------------------------------------------------------------------------------------------------------- //! Method calculating the Particle dynamics (interpolation, pusher, projection and more) @@ -700,7 +700,7 @@ void Species::dynamics( double time_dual, if( time_dual>time_frozen_ || Ionize) { // moving particle // Prepare temporary buffers for this iteration -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) smpi->resizeDeviceBuffers( ithread, nDim_field, particles->numberOfParticles() ); @@ -713,7 +713,7 @@ void Species::dynamics( double time_dual, patch->startFineTimer(mBW_timer_id_); -#if defined( SMILEI_OPENACC_MODE) +#if defined( SMILEI_ACCELERATOR_GPU_OACC) static_cast(mBW_pair_particles_[0])->deviceResize( particles->deviceSize() * Multiphoton_Breit_Wheeler_process->getPairCreationSampling(0) ); static_cast(mBW_pair_particles_[0])->resetCellKeys(); static_cast(mBW_pair_particles_[1])->deviceResize( particles->deviceSize() * Multiphoton_Breit_Wheeler_process->getPairCreationSampling(1) ); @@ -726,7 +726,7 @@ void Species::dynamics( double time_dual, patch->stopFineTimer(mBW_timer_id_); } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) // Make sure some bin preconditions are respected SMILEI_ASSERT( particles->first_index.size() == 1 ); SMILEI_ASSERT( particles->last_index.size() >= 1 ); @@ -832,7 +832,7 @@ void Species::dynamics( double time_dual, // Compression of the bins if necessary if( Multiphoton_Breit_Wheeler_process ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC removeTaggedParticles(smpi, &particles->first_index[0], &particles->last_index[0], @@ -1690,14 +1690,14 @@ void Species::dynamicsImportParticles( double time_dual, Params &params, Patch * // Radiation losses if( Radiate && photon_species_ ) { // If creation of macro-photon, we add them to photon_species -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We first erase empty slots in the buffer of photons // radiation_photons_->cell_keys is used as a mask static_cast(radiated_photons_)->eraseLeavingParticles(); #endif photon_species_->importParticles( params, patch, *radiated_photons_, localDiags, time_dual ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We explicitely clear the device Particles static_cast(radiated_photons_)->deviceClear(); #endif @@ -1709,7 +1709,7 @@ void Species::dynamicsImportParticles( double time_dual, Params &params, Patch *
// Addition of the electron-positron particles for( int k=0; k<2; k++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We first erase empty slots in the buffer of photons // radiation_photons_->cell_keys is used as a mask static_cast(mBW_pair_particles_[k])->eraseLeavingParticles(); @@ -1717,7 +1717,7 @@ void Species::dynamicsImportParticles( double time_dual, Params &params, Patch * mBW_pair_species_[k]->importParticles( params, patch, *mBW_pair_particles_[k], localDiags, time_dual ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We explicitely clear the device Particles static_cast(mBW_pair_particles_[k])->deviceClear(); #endif @@ -1771,7 +1771,7 @@ void Species::computeCharge( ElectroMagn *EMfields, bool old /*=false*/ ) void Species::sortParticles( Params &params ) { -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) // ----------------------------- // GPU version @@ -2096,7 +2096,7 @@ void Species::countSortParticles( Params &params ) // Move all particles from another species to this one void Species::importParticles( Params &params, Patch *patch, Particles &source_particles, vector &localDiags, double time_dual, Ionization *I ) { -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) // --------------------------------------------------- // GPU version // Warning: the GPU version does not handle bin and sorting @@ -2207,7 +2207,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { const int nparts = smpi->dynamics_Epart[ithread].size()/3; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC double *const __restrict__ weight = particles->getPtrWeight(); @@ -2246,7 +2246,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { const int nbin = particles->numberOfBins(); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel \ present(Ex[0:nparts],Ey[0:nparts],Ez[0:nparts], \ Bx[0:nparts], By[0:nparts], Bz[0:nparts], \ @@ -2291,7 +2291,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { if (copy_particle_number>0) { -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC particles->overwriteParticle(copy_first_index, particles->last_index[ibin], copy_particle_number, compute_cell_keys ); #else for (auto ipart = 0 ; ipart < copy_particle_number ; ipart ++) { @@ -2346,7 +2346,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { } } -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC if (thetaold) { for( unsigned int ipart = 0 ; ipart < copy_particle_number ; ipart ++ ) { thetaold[copy_first_index + ipart] = thetaold[particles->last_index[ibin] + ipart]; @@ -2384,7 +2384,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { } } -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end parallel region #endif @@ -2418,7 +2418,7 @@ void Species::removeTaggedParticlesPerBin( // Weight shortcut double *const __restrict__ weight = particles->getPtrWeight(); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC double *const __restrict__ position_x = particles->getPtrPosition( 0 ); double *const __restrict__ position_y = nDim_particle > 1 ?
particles->getPtrPosition( 1 ) : nullptr; double *const __restrict__ position_z = nDim_particle > 2 ? particles->getPtrPosition( 2 ) : nullptr; @@ -2436,7 +2436,7 @@ void Species::removeTaggedParticlesPerBin( // Total number of bins / cells const int nbin = particles->numberOfBins(); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel \ present(Epart[0:nparts*3],\ Bpart[0:nparts*3], \ @@ -2478,7 +2478,7 @@ void Species::removeTaggedParticlesPerBin( if( ipart < last_photon_index ) { // The last existing photon comes to the position of // the deleted photon -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC particles->overwriteParticle( last_photon_index, ipart, compute_cell_keys ); #else weight[ipart] = weight[last_photon_index]; @@ -2512,7 +2512,7 @@ void Species::removeTaggedParticlesPerBin( } gamma[ipart] = gamma[0*nparts+last_photon_index]; -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC if (thetaold) { thetaold[0*nparts+ipart] = thetaold[0*nparts+last_photon_index]; } @@ -2539,13 +2539,14 @@ void Species::removeTaggedParticlesPerBin( } // if last_index[ibin] > first_index[ibin] } // end loop over the bins -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end parallel region #endif } //! This method removes particles with a negative weight //! when a single bin is used +#ifdef SMILEI_ACCELERATOR_GPU_OACC void Species::removeTaggedParticles( SmileiMPI *smpi, int *const first_index, @@ -2554,8 +2555,6 @@ void Species::removeTaggedParticles( bool compute_cell_keys) { -#ifdef SMILEI_OPENACC_MODE - unsigned int new_n_parts = 0; unsigned int nb_deleted = 0; @@ -2623,7 +2622,7 @@ void Species::removeTaggedParticles( // that will not be erased // Backward loop over the tagged particles to fill holes in the photon particle array (at the bin level only) -//#ifdef SMILEI_OPENACC_MODE +//#ifdef SMILEI_ACCELERATOR_GPU_OACC // #pragma acc loop seq //#endif for( int ipart=last_moving_index-1 ; ipart>=*first_index; ipart-- ) { @@ -2700,9 +2699,9 @@ void Species::removeTaggedParticles( } } // if nparts > 0 +} #endif -} // ------------------------------------------------ // Set position when using restart & moving window diff --git a/src/Species/Species.h b/src/Species/Species.h index 83a2bab9d..d4af3bf9d 100755 --- a/src/Species/Species.h +++ b/src/Species/Species.h @@ -6,7 +6,7 @@ // #include "PyTools.h" #include "Particles.h" -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include "nvidiaParticles.h" #endif #include "Params.h" @@ -382,7 +382,7 @@ class Species return particles->capacity(); } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) void allocateParticlesOnDevice(); @@ -566,12 +566,14 @@ class Species //! This method removes particles with a negative weight //! when a single bin is used +#ifdef SMILEI_ACCELERATOR_GPU_OACC void removeTaggedParticles( SmileiMPI *smpi, int *const first_index, int *const last_index, int ithread, bool compute_cell_keys = false); +#endif //! 
Moving window boundary conditions managment void disableXmax(); diff --git a/src/Tools/Pragma.h b/src/Tools/Pragma.h index b1a81cdae..0fb5e1e9d 100644 --- a/src/Tools/Pragma.h +++ b/src/Tools/Pragma.h @@ -31,7 +31,7 @@ #if defined ( SMILEI_ACCELERATOR_GPU_OMP ) #define ATOMIC(mode) \ _Pragma( TOSTRING(omp atomic mode)) -#elif defined ( SMILEI_OPENACC_MODE ) +#elif defined ( SMILEI_ACCELERATOR_GPU_OACC ) #define ATOMIC(mode) \ _Pragma( TOSTRING(acc atomic mode)) #endif diff --git a/src/Tools/gpu.cpp b/src/Tools/gpu.cpp index 7ce000e03..497786096 100644 --- a/src/Tools/gpu.cpp +++ b/src/Tools/gpu.cpp @@ -1,6 +1,6 @@ #include "gpu.h" -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) && defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) && defined( SMILEI_ACCELERATOR_GPU_OACC ) #error "You can not enable both OpenACC and OpenMP GPU support" #endif @@ -29,7 +29,7 @@ #else #error "Asking for OpenMP support without enabling compiler support for OpenMP" #endif -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #if defined( _OPENACC ) #include #else @@ -46,11 +46,12 @@ namespace smilei { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target enter data map( alloc \ : byte_array [0:a_count * an_object_size] ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc enter data create( byte_array [0:a_count * an_object_size] ) #else SMILEI_UNUSED( a_host_pointer ); SMILEI_UNUSED( a_count ); + SMILEI_UNUSED( an_object_size ); SMILEI_UNUSED( byte_array ); #endif } @@ -61,11 +62,12 @@ namespace smilei { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target enter data map( to \ : byte_array [0:a_count * an_object_size] ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc enter data copyin( byte_array [0:a_count * an_object_size] ) #else SMILEI_UNUSED( a_host_pointer ); SMILEI_UNUSED( a_count ); + SMILEI_UNUSED( an_object_size ); SMILEI_UNUSED( byte_array ); #endif } @@ -75,11 +77,12 @@ namespace smilei { const unsigned char* byte_array = static_cast( a_host_pointer ); #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target update to( byte_array [0:a_count * an_object_size] ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc update device( byte_array [0:a_count * an_object_size] ) #else SMILEI_UNUSED( a_host_pointer ); SMILEI_UNUSED( a_count ); + SMILEI_UNUSED( an_object_size ); SMILEI_UNUSED( byte_array ); #endif } @@ -89,11 +92,12 @@ namespace smilei { unsigned char* byte_array = static_cast( a_host_pointer ); #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target update from( byte_array [0:a_count * an_object_size] ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc update host( byte_array [0:a_count * an_object_size] ) #else SMILEI_UNUSED( a_host_pointer ); SMILEI_UNUSED( a_count ); + SMILEI_UNUSED( an_object_size ); SMILEI_UNUSED( byte_array ); #endif } @@ -104,11 +108,12 @@ namespace smilei { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target exit data map( from \ : byte_array [0:a_count * an_object_size] ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc exit data copyout( byte_array [0:a_count * an_object_size] ) #else SMILEI_UNUSED( a_host_pointer ); SMILEI_UNUSED( a_count ); + SMILEI_UNUSED( an_object_size ); SMILEI_UNUSED( byte_array ); #endif } @@ -119,11 +124,12 @@ namespace smilei { #if 
defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target exit data map( delete \ : byte_array [0:a_count * an_object_size] ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc exit data delete( byte_array [0:a_count * an_object_size] ) #else SMILEI_UNUSED( a_host_pointer ); SMILEI_UNUSED( a_count ); + SMILEI_UNUSED( an_object_size ); SMILEI_UNUSED( byte_array ); #endif } @@ -154,7 +160,7 @@ namespace smilei { SMILEI_ASSERT( a_device_pointer != nullptr ); return const_cast( a_device_pointer ); -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) //return const_cast( ::acc_deviceptr( a_host_pointer ) ); return ::acc_deviceptr( const_cast(a_host_pointer) ) ; #else @@ -171,7 +177,7 @@ namespace smilei { a_count * an_object_size, 0, 0, device_num, device_num ) != 0 ) { ERROR( "omp_target_memcpy failed" ); } -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) // It seems that the interface of ::acc_memcpy_device does not accept ptr to array of const type ! // https://www.openacc.org/sites/default/files/inline-files/OpenACC.2.7.pdf // void acc_memcpy_device( d_void* dest, d_void* src, size_t bytes ); diff --git a/src/Tools/gpu.h b/src/Tools/gpu.h index 28a8c98da..bc6552986 100644 --- a/src/Tools/gpu.h +++ b/src/Tools/gpu.h @@ -19,7 +19,7 @@ namespace smilei { #define SMILEI_ACCELERATOR_DECLARE_ROUTINE _Pragma( "omp declare target" ) #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END _Pragma( "omp end declare target" ) #define SMILEI_ACCELERATOR_ATOMIC _Pragma( "omp atomic update" ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #define SMILEI_ACCELERATOR_DECLARE_ROUTINE _Pragma( "acc routine seq" ) #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END #define SMILEI_ACCELERATOR_ATOMIC _Pragma( "acc atomic" ) diff --git a/src/Tools/gpuRandom.h b/src/Tools/gpuRandom.h index 916a7b8f8..bdb9aca59 100644 --- a/src/Tools/gpuRandom.h +++ b/src/Tools/gpuRandom.h @@ -1,7 +1,7 @@ #ifndef GPU_RANDOM #define GPU_RANDOM -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) // #include #include "curand_kernel.h" #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -29,7 +29,7 @@ namespace smilei { { protected: using State = -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) ::curandState_t; #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) // TODO @@ -42,7 +42,7 @@ namespace smilei { public: Random() -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) : a_state_{ 0xDEADBEEFU } #else @@ -53,26 +53,36 @@ namespace smilei { } // Initialization +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) void init( unsigned long long seed, unsigned long long seq, unsigned long long offset ) { -#if defined( SMILEI_OPENACC_MODE ) // Cuda generator initialization ::curand_init( seed, seq, offset, &a_state_ ); + } #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + void init( unsigned long long seed, + unsigned long long , + unsigned long long ) + { // Hip generator initialization // ::hiprand_init( seed, seq, offset, &state ); a_state_ = State{ static_cast( seed ) }; + } #else + void init( unsigned long long seed, + unsigned long long , + unsigned long long ) + { a_state_ = State{ static_cast( seed ) }; -#endif } +#endif // Initialization double uniform() { -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) return ::curand_uniform( &a_state_ ); #elif defined( 
SMILEI_ACCELERATOR_GPU_OMP ) // TODO diff --git a/src/Tools/userFunctions.h b/src/Tools/userFunctions.h index 63753fb20..d9525723d 100755 --- a/src/Tools/userFunctions.h +++ b/src/Tools/userFunctions.h @@ -1,5 +1,5 @@ -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif @@ -36,7 +36,7 @@ class userFunctions //! \param array array in which to find the value //! \param elem element to be found //! \param nb_elem number of elements -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif template From da51604bac5a2590f4e04df582ff9d38d223c08a Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Mon, 27 May 2024 14:59:52 +0200 Subject: [PATCH 35/54] update ci --- .gitlab-ci.yml | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f50bfd819..cf3208df7 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,8 +15,7 @@ install: stage: install only: - develop - - particle_exchange - + script: # Force workdir cleaning in case of retried - echo "CI_PIPELINE_ID = " $CI_PIPELINE_ID @@ -34,8 +33,7 @@ compile_default: stage: compile_default only: - develop - - particle_exchange - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -46,8 +44,7 @@ runQuick: stage: run_quick only: - develop - - particle_exchange - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -58,8 +55,7 @@ run1D: stage: run_default only: - develop - - particle_exchange - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -71,8 +67,7 @@ run2D: stage: run_default only: - develop - - particle_exchange - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -86,8 +81,7 @@ run3D: stage: run_default only: - develop - - particle_exchange - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -102,8 +96,7 @@ runAM: stage: run_default only: - develop - - particle_exchange - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -115,8 +108,7 @@ runCollisions: stage: run_default only: - develop - - particle_exchange - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation From 2d0474ae533225023478e40ef0a757537d665b7c Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Tue, 28 May 2024 10:49:09 +0200 Subject: [PATCH 36/54] test CI From 83ee20d1a895da93ce092af8627bb009f65f63ab Mon Sep 17 00:00:00 2001 From: Arnaud Beck Date: Tue, 28 May 2024 11:43:27 +0200 Subject: [PATCH 37/54] retest CI --- .gitlab-ci.yml | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cf3208df7..e2efed6dc 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,8 +8,8 @@ stages: - compile_debug - compile_no_mpi_threadmultiple - compile_no_openmp - - compile_omptasks - - run_omptasks +# - compile_omptasks +# - run_omptasks install: stage: install @@ -164,21 +164,21 @@ compile_no_openmp: - make clean - python validation/validation.py -k noopenmp -c -v -compile_omptasks: - stage: compile_omptasks - only: - - develop - - script: - - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei - - make clean - - python validation/validation.py -k omptasks -c -v - -run_omptasks: - stage: run_omptasks - only: - - develop - - script: - - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation - - python 
validation.py -k omptasks -b "tst2d_tasks_01_radiation_pressure_acc.py" -m 4 -o 4 -n 1 -v +#compile_omptasks: +# stage: compile_omptasks +# only: +# - develop +# +# script: +# - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei +# - make clean +# - python validation/validation.py -k omptasks -c -v +# +#run_omptasks: +# stage: run_omptasks +# only: +# - develop +# +# script: +# - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation +# - python validation.py -k omptasks -b "tst2d_tasks_01_radiation_pressure_acc.py" -m 4 -o 4 -n 1 -v From b9754d7101c874e68359bb636916cb08f24520e9 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Fri, 31 May 2024 18:08:35 +0200 Subject: [PATCH 38/54] support matplotlib 3.9 --- doc/Sphinx/Use/namelist.rst | 32 +++++++++++++++++++------------- happi/_Utils.py | 5 ++++- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/doc/Sphinx/Use/namelist.rst b/doc/Sphinx/Use/namelist.rst index 6c5eaf2be..a07f19005 100755 --- a/doc/Sphinx/Use/namelist.rst +++ b/doc/Sphinx/Use/namelist.rst @@ -3347,19 +3347,20 @@ for instance:: def my_filter(particles): return (particles.px>-1.)*(particles.px<1.) + (particles.pz>3.) -.. Note:: The ``px``, ``py`` and ``pz`` quantities are not exactly the momenta. - They are actually the velocities multiplied by the lorentz factor, i.e., - :math:`\gamma v_x`, :math:`\gamma v_y` and :math:`\gamma v_z`. This is true only - inside the ``filter`` function (not for the output of the diagnostic). - -.. Note:: The ``id`` attribute contains the :doc:`particles identification number`. - This number is set to 0 at the beginning of the simulation. **Only after particles have - passed the filter**, they acquire a positive ``id``. - -.. Note:: For advanced filtration, Smilei provides the quantity ``Main.iteration``, - accessible within the ``filter`` function. Its value is always equal to the current - iteration number of the PIC loop. The current time of the simulation is thus - ``Main.iteration * Main.timestep``. +.. Note:: + + * In the ``filter`` function only, the ``px``, ``py`` and ``pz`` quantities + are not exactly the momenta. + They are actually the velocities multiplied by the lorentz factor, i.e., + :math:`\gamma v_x`, :math:`\gamma v_y` and :math:`\gamma v_z`. + This is *not* true for the output of the diagnostic. + * The ``id`` attribute contains the :doc:`particles identification number`. + This number is set to 0 at the beginning of the simulation. **Only after particles have + passed the filter**, they acquire a positive ``id``. + * For advanced filtration, Smilei provides the quantity ``Main.iteration``, + accessible within the ``filter`` function. Its value is always equal to the current + iteration number of the PIC loop. The current time of the simulation is thus + ``Main.iteration * Main.timestep``. .. py:data:: attributes @@ -3372,6 +3373,11 @@ for instance:: (``"chi"``, only for species with radiation losses) or the fields interpolated at their positions (``"Ex"``, ``"Ey"``, ``"Ez"``, ``"Bx"``, ``"By"``, ``"Bz"``). +.. Note:: Here, interpolated fields are normally computed after the Maxwell solver. + They may thus differ by half a timestep from those computed at the middle of the + timestep to push particles. When exact values are needed, use the option + :py:data:`keep_interpolated_fields`. + ---- .. 
rst-class:: experimental diff --git a/happi/_Utils.py b/happi/_Utils.py index 28dd028df..070046786 100755 --- a/happi/_Utils.py +++ b/happi/_Utils.py @@ -42,7 +42,10 @@ def updateMatplotLibColormaps(): if "smilei" in matplotlib.pyplot.colormaps(): return def register(name, d): cmap = matplotlib.colors.LinearSegmentedColormap(name, d, N=256, gamma=1.0) - matplotlib.pyplot.register_cmap(cmap=cmap) + try: + matplotlib.pyplot.register_cmap(cmap=cmap) + except Exception as e: + matplotlib.colormaps.register(cmap) register(u"smilei", { 'red' :((0., 0., 0.), (0.0625 , 0.091, 0.091), (0.09375, 0.118, 0.118), (0.125 , 0.127, 0.127), (0.1875 , 0.135, 0.135), (0.21875, 0.125, 0.125), (0.28125, 0.034, 0.034), (0.3125 , 0.010, 0.010), (0.34375, 0.009, 0.009), (0.4375 , 0.049, 0.049), (0.46875, 0.057, 0.057), (0.5 , 0.058, 0.058), (0.59375, 0.031, 0.031), (0.625 , 0.028, 0.028), (0.65625, 0.047, 0.047), (0.71875, 0.143, 0.143), (0.78125, 0.294, 0.294), (0.84375, 0.519, 0.519), (0.90625, 0.664, 0.664), (0.9375 , 0.760, 0.760), (0.96875, 0.880, 0.880), (1., 1., 1. )), 'green':((0., 0., 0.), (0.21875, 0.228, 0.228), (0.78125, 0.827, 0.827), (0.8125 , 0.852, 0.852), (0.84375, 0.869, 0.869), (0.9375 , 0.937, 0.937), (0.96875, 0.967, 0.967), (1. , 1. , 1. )), From d5eadb44ad81b974c52b9d61a9473903c57f33a8 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Fri, 31 May 2024 18:32:03 +0200 Subject: [PATCH 39/54] Fix recent commit for laser offset --- src/Params/Params.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/Params/Params.cpp b/src/Params/Params.cpp index b1fafcb09..69973d104 100755 --- a/src/Params/Params.cpp +++ b/src/Params/Params.cpp @@ -1063,19 +1063,18 @@ Params::Params( SmileiMPI *smpi, std::vector namelistsFiles ) : ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile needs 2 profiles.", LINK_NAMELIST + std::string("#lasers") ); } vector profiles_n; + vector profiles_kept; for( unsigned int i = 0; i < 2; i++ ) { - if( profiles[i] == Py_None ) { - Py_DECREF( profiles[i] ); - profiles.erase( profiles.begin() ); - } else { - profiles_n.push_back( i ); + if( profiles[i] != Py_None ) { + profiles_kept.push_back( profiles[i] ); + profiles_n.push_back( i + 1 ); } } - if( profiles.size() == 0 ) { + if( profiles_kept.size() == 0 ) { ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile cannot be [None, None]", LINK_NAMELIST + std::string("#lasers") ); } - for( unsigned int i=0; i namelistsFiles ) : // Make the propagation happen and write out the file if( ! 
smpi->test_mode ) { - propagateX( profiles, profiles_n, offset, file, keep_n_strongest_modes, angle_z ); + propagateX( profiles_kept, profiles_n, offset, file, keep_n_strongest_modes, angle_z ); } } From 4642c1b79eb8dabb306c5d252d434cd23e9fb931 Mon Sep 17 00:00:00 2001 From: "charles.prouveur" Date: Sun, 2 Jun 2024 01:14:50 +0200 Subject: [PATCH 40/54] Fix: Adapting new 1D GPU implementation to the change in macro names --- src/ElectroMagn/ElectroMagn1D.cpp | 8 ++++---- src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp | 4 ++-- src/ElectroMagnSolver/MA_Solver1D_norm.cpp | 8 ++++---- src/ElectroMagnSolver/MF_Solver1D_Yee.cpp | 4 ++-- src/Field/Field1D.cpp | 14 +++++++------- src/Interpolator/Interpolator1D2Order.cpp | 10 +++++----- src/Projector/Projector1D2OrderGPU.cpp | 10 +++++----- src/Projector/Projector1D2OrderGPU.h | 2 +- src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu | 4 ++-- src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h | 2 +- src/Projector/Projector2D2OrderGPU.cpp | 2 +- src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h | 1 - src/Projector/ProjectorFactory.h | 6 +++--- 13 files changed, 37 insertions(+), 38 deletions(-) diff --git a/src/ElectroMagn/ElectroMagn1D.cpp b/src/ElectroMagn/ElectroMagn1D.cpp index 4891912fb..d90c6ee2e 100755 --- a/src/ElectroMagn/ElectroMagn1D.cpp +++ b/src/ElectroMagn/ElectroMagn1D.cpp @@ -570,7 +570,7 @@ void ElectroMagn1D::centerMagneticFields() // for Bx^(p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofBx = Bx_->size(); const int sizeofBy = By_->size(); const int sizeofBz = Bz_->size(); @@ -586,7 +586,7 @@ void ElectroMagn1D::centerMagneticFields() } // for By^(d) & Bz^(d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(Bz1D[0:sizeofBz],Bz1D_m[0:sizeofBz],By1D[0:sizeofBy],By1D_m[0:sizeofBy]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -601,7 +601,7 @@ void ElectroMagn1D::centerMagneticFields() if (use_BTIS3){ double *const By1D_oldBTIS3 = By_mBTIS3->data(); double *const Bz1D_oldBTIS3 = Bz_mBTIS3->data(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofByBTIS3 = By_mBTIS3->size(); const int sizeofBzBTIS3 = Bz_mBTIS3->size(); #pragma acc parallel present(By1D_oldBTIS3[0:sizeofByBTIS3],By1D[0:sizeofBy],Bz1D_oldBTIS3[0:sizeofBzBTIS3],Bz1D[0:sizeofBz]) @@ -610,7 +610,7 @@ void ElectroMagn1D::centerMagneticFields() #pragma omp target #pragma omp teams distribute parallel for #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int i=0 ; igetAmplitude1( pos, time_dual, 0, 0 ); } -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC const int sizeofE1 = E[1]->number_of_points_; const int sizeofE2 = E[2]->number_of_points_; const int sizeofB1 = B[1]->number_of_points_; @@ -102,7 +102,7 @@ void ElectroMagnBC1D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * #endif // Apply Silver-Mueller EM boundary condition at x=xmin or xmax -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E1[0:sizeofE1],E2[0:sizeofE2],B1[0:sizeofB1],B2[0:sizeofB2]) #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target diff --git a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp index 4ef123b2d..a5ceccabe 100755 --- a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp +++ b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp @@ 
-32,7 +32,7 @@ void MA_Solver1D_norm::operator()( ElectroMagn *fields ) // Solve Maxwell-Ampere // -------------------- // Calculate the electrostatic field ex on the dual grid -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -45,7 +45,7 @@ void MA_Solver1D_norm::operator()( ElectroMagn *fields ) #pragma omp target #pragma omp teams distribute parallel for #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int ix=0 ; ixEy_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; const int sizeofBy = fields->By_->number_of_points_; @@ -43,7 +43,7 @@ void MF_Solver1D_Yee::operator()( ElectroMagn *fields ) #pragma omp target #pragma omp teams distribute parallel for #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int ix=1 ; ixallocateAndCopyFromHostToDevice(); recvFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); @@ -279,7 +279,7 @@ void Field1D::create_sub_fields ( int iDim, int iNeighbor, int ghost_size ) #endif } else if( ghost_size != (int) sendFields_[iDim*2+iNeighbor]->dims_[iDim] ) { -#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU ) ERROR( "To Do GPU : envelope" ); #endif delete sendFields_[iDim*2+iNeighbor]; @@ -313,7 +313,7 @@ void Field1D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) const unsigned field_last = ix + NX - 1; #pragma omp target if( should_manipulate_gpu_memory ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( (name.substr(0,1) == "B") ); @@ -348,7 +348,7 @@ void Field1D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) #pragma omp target if( should_manipulate_gpu_memory ) \ map( tofrom : field [field_first:field_last - field_first] ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); const int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "B" ); @@ -384,7 +384,7 @@ void Field1D::extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma omp target if( should_manipulate_gpu_memory ) \ map( to : field [field_first:field_last - field_first] ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( ((name.substr(0,1) == "J") || (name.substr(0,1) == "R") ) && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub )); @@ -419,7 +419,7 @@ void Field1D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma omp target if( should_manipulate_gpu_memory ) \ map( tofrom : field [field_first:field_last - field_first] ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); int fSize = number_of_points_; bool fieldName( 
name.substr(0,1) == "J" || name.substr(0,1) == "R"); diff --git a/src/Interpolator/Interpolator1D2Order.cpp b/src/Interpolator/Interpolator1D2Order.cpp index dd245bfd1..0af89f3e6 100755 --- a/src/Interpolator/Interpolator1D2Order.cpp +++ b/src/Interpolator/Interpolator1D2Order.cpp @@ -143,7 +143,7 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, const double *const __restrict__ By1D = static_cast( EMfields->By_m )->data(); const double *const __restrict__ Bz1D = static_cast( EMfields->Bz_m )->data(); -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) const int sizeofEx = EMfields->Ex_->size(); const int sizeofEy = EMfields->Ey_->size(); const int sizeofEz = EMfields->Ez_->size(); @@ -163,7 +163,7 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target map( to : i_domain_begin_) is_device_ptr (position_x) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 0 * nparts ) - first_index; @@ -210,7 +210,7 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, delta[0*nparts+ipart] = delta_p[0]; } // end ipart loop - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif @@ -222,7 +222,7 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target map( to : i_domain_begin_) is_device_ptr ( position_x) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 1 * nparts ) - first_index; @@ -286,7 +286,7 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, delta[0*nparts+ipart] = delta_p[0]; } // end ipart loop - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } // end with B-TIS interpolation diff --git a/src/Projector/Projector1D2OrderGPU.cpp b/src/Projector/Projector1D2OrderGPU.cpp index c63223885..19493ef8d 100755 --- a/src/Projector/Projector1D2OrderGPU.cpp +++ b/src/Projector/Projector1D2OrderGPU.cpp @@ -1,6 +1,6 @@ -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #include "Projector1D2OrderGPUKernelCUDAHIP.h" #include #include "Tools.h" @@ -23,7 +23,7 @@ Projector1D2OrderGPU::Projector1D2OrderGPU( Params ¶meters, Patch *a_patch ) not_spectral_ = !parameters.is_pxr; dts2_ = parameters.timestep / 2.0; dts4_ = dts2_ / 2.0; -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) x_dimension_bin_count_ = parameters.getGPUBinCount( 1 ); #else ERROR( "Only usable in GPU mode! " ); @@ -33,7 +33,7 @@ Projector1D2OrderGPU::Projector1D2OrderGPU( Params ¶meters, Patch *a_patch ) Projector1D2OrderGPU::~Projector1D2OrderGPU() { } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //! Project global current densities (EMfields->Jx_/Jy_/Jz_) @@ -216,7 +216,7 @@ void Projector1D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, // Does not compute Rho ! 
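(Throughout this series the umbrella macro SMILEI_ACCELERATOR_MODE becomes SMILEI_ACCELERATOR_GPU, and the OpenACC-specific SMILEI_OPENACC_MODE becomes SMILEI_ACCELERATOR_GPU_OACC, as in the hunk just below. Judging from conditionals replaced elsewhere in the series, where "defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC )" collapses to "defined( SMILEI_ACCELERATOR_GPU )", the intended relationship between the macros is presumably the following. This is a sketch for orientation only; the real definitions come from the build system, which is not part of this excerpt:

    // Sketch only (assumed, not taken from this patch).
    // Backend-specific macros, at most one defined at build time:
    //   SMILEI_ACCELERATOR_GPU_OMP   OpenMP target offload
    //   SMILEI_ACCELERATOR_GPU_OACC  OpenACC (formerly SMILEI_OPENACC_MODE)
    #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC )
        // Umbrella macro (formerly SMILEI_ACCELERATOR_MODE): any GPU backend.
        #define SMILEI_ACCELERATOR_GPU
    #endif

Backend-agnostic code, like the kernel dispatch below, then tests only the umbrella macro, while code that must distinguish OpenMP-target from OpenACC keeps testing the specific ones.)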
-#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) currentAndDensityDepositionKernel1DOnDevice( b_Jx,b_Jy,b_Jz,b_rho, Jx_size, Jy_size, Jz_size, rho_size, @@ -245,7 +245,7 @@ void Projector1D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, } else{ -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) currentDepositionKernel1DOnDevice(Jx_, Jy_, Jz_, EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), particles.getPtrPosition( 0 ), diff --git a/src/Projector/Projector1D2OrderGPU.h b/src/Projector/Projector1D2OrderGPU.h index 7ce78af1e..f35e8e4ee 100755 --- a/src/Projector/Projector1D2OrderGPU.h +++ b/src/Projector/Projector1D2OrderGPU.h @@ -64,7 +64,7 @@ class Projector1D2OrderGPU : public Projector1D int ispec, int icell = 0, int ipart_ref = 0 ) override {}; -/*#if defined( SMILEI_ACCELERATOR_MODE ) +/*#if defined( SMILEI_ACCELERATOR_GPU ) extern "C" void currentDepositionKernel1DOnDevice( double *__restrict__ Jx, diff --git a/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu index df3c3dbbc..0a77a63db 100755 --- a/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu +++ b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu @@ -68,7 +68,7 @@ // device_particle_charge /* [0:particle_count] */, \ // device_particle_weight /* [0:particle_count] */ ) // #pragma omp teams thread_limit( 64 ) distribute parallel for -// #elif defined( SMILEI_OPENACC_MODE ) +// #elif defined( SMILEI_ACCELERATOR_GPU_OACC ) // #pragma acc parallel \ // deviceptr( device_particle_position_x, \ // device_particle_momentum_y, \ @@ -217,7 +217,7 @@ // device_particle_charge /* [0:particle_count] */, \ // device_particle_weight /* [0:particle_count] */ ) // #pragma omp teams thread_limit( 64 ) distribute parallel for -// #elif defined( SMILEI_OPENACC_MODE ) +// #elif defined( SMILEI_ACCELERATOR_GPU_OACC ) // #pragma acc parallel \ // deviceptr( device_particle_position_x, \ // device_particle_momentum_y, \ diff --git a/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h index 37cabb963..f5e64e408 100755 --- a/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h +++ b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h @@ -3,7 +3,7 @@ #ifndef Projector1D2OrderGPUKernelCUDAHIP_H #define Projector1D2OrderGPUKernelCUDAHIP_H -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #if defined( __HIP__ ) #include diff --git a/src/Projector/Projector2D2OrderGPU.cpp b/src/Projector/Projector2D2OrderGPU.cpp index 119556fb6..a91a29dde 100755 --- a/src/Projector/Projector2D2OrderGPU.cpp +++ b/src/Projector/Projector2D2OrderGPU.cpp @@ -200,7 +200,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy double, int not_spectral ) { - currentAndDensityDepositionKernelOnDevice( Jx, + currentAndDensityDepositionKernel2DOnDevice( Jx, Jy, Jz, rho, diff --git a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h index 6e5fec26c..d789796ab 100755 --- a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h +++ b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h @@ -3,7 +3,6 @@ #ifndef Projector2D2OrderGPUKernelCUDAHIP_H #define Projector2D2OrderGPUKernelCUDAHIP_H - #if defined( SMILEI_ACCELERATOR_GPU ) #if defined( __HIP__ ) diff --git a/src/Projector/ProjectorFactory.h b/src/Projector/ProjectorFactory.h index 2d6a94f90..5b1f50e37 100755 --- 
a/src/Projector/ProjectorFactory.h +++ b/src/Projector/ProjectorFactory.h @@ -34,7 +34,7 @@ class ProjectorFactory // 1Dcartesian simulation // --------------- if( ( params.geometry == "1Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { - #if defined( SMILEI_ACCELERATOR_MODE ) + #if defined( SMILEI_ACCELERATOR_GPU ) Proj = new Projector1D2OrderGPU( params, patch ); #else Proj = new Projector1D2Order( params, patch ); @@ -47,7 +47,7 @@ class ProjectorFactory // --------------- else if( ( params.geometry == "2Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { if( !vectorization ) { - #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) + #if defined( SMILEI_ACCELERATOR_GPU ) Proj = new Projector2D2OrderGPU( params, patch ); #else Proj = new Projector2D2Order( params, patch ); @@ -69,7 +69,7 @@ class ProjectorFactory // --------------- else if( ( params.geometry == "3Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { if( !vectorization ) { - #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) + #if defined( SMILEI_ACCELERATOR_GPU ) Proj = new Projector3D2OrderGPU( params, patch ); #else Proj = new Projector3D2Order( params, patch ); From 7323b5bd44c31bc5cfb865a725e4e7648ceb5483 Mon Sep 17 00:00:00 2001 From: "charles.prouveur" Date: Mon, 3 Jun 2024 13:31:24 +0200 Subject: [PATCH 41/54] Fixing error in 1D MA solver introduced with its GPU implementation --- src/ElectroMagnSolver/MA_Solver1D_norm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp index a5ceccabe..803ffc6cb 100755 --- a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp +++ b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp @@ -65,8 +65,8 @@ void MA_Solver1D_norm::operator()( ElectroMagn *fields ) #pragma omp simd #endif for( unsigned int ix=0 ; ix Date: Mon, 3 Jun 2024 18:45:16 +0200 Subject: [PATCH 42/54] Fixing typo in 1D SM BC --- src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp b/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp index be9945707..ff767bc12 100755 --- a/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp +++ b/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp @@ -111,7 +111,7 @@ void ElectroMagnBC1D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * //( *By1D )( iB_ ) = -sign_*Alpha_*( *Ez1D )( iE_ ) + Beta_*( ( *By1D )( iB_old_ )-By_val_ ) + Gamma_*by + By_val_; //( *Bz1D )( iB_ ) = sign_*Alpha_*( *Ey1D )( iE_ ) + Beta_*( ( *Bz1D )( iB_old_ )-Bz_val_ ) + Gamma_*bz + Bz_val_; B1[ iB_ ] = -sign_ * Alpha_ * E2[iE_] + Beta_ * ( B1[iB_old_] - By_val_) + Gamma_ * by + By_val_; - B2[ iB_ ] = -sign_ * Alpha_ * E1[iE_] + Beta_ * ( B2[iB_old_] - Bz_val_) + Gamma_ * bz + Bz_val_; + B2[ iB_ ] = sign_ * Alpha_ * E1[iE_] + Beta_ * ( B2[iB_old_] - Bz_val_) + Gamma_ * bz + Bz_val_; } } From 5fa52043ae845a09e21408108dda8323c01e482a Mon Sep 17 00:00:00 2001 From: "charles.prouveur" Date: Tue, 4 Jun 2024 12:06:14 +0200 Subject: [PATCH 43/54] reversed some changes in the betis part in interpolation 1D order 2 - to be updated --- src/Interpolator/Interpolator1D2Order.cpp | 170 +++++++++++++++------- src/Interpolator/Interpolator1D2Order.h | 40 ----- 2 files changed, 120 insertions(+), 90 deletions(-) diff --git a/src/Interpolator/Interpolator1D2Order.cpp b/src/Interpolator/Interpolator1D2Order.cpp index 0af89f3e6..b21954702 
100755 --- a/src/Interpolator/Interpolator1D2Order.cpp +++ b/src/Interpolator/Interpolator1D2Order.cpp @@ -22,17 +22,16 @@ Interpolator1D2Order::Interpolator1D2Order( Params ¶ms, Patch *patch ) : Int void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, int ipart, int nparts, double *ELoc, double *BLoc ) { // Static cast of the electromagnetic fields - Field1D *Ex1D = static_cast( EMfields->Ex_ ); - Field1D *Ey1D = static_cast( EMfields->Ey_ ); - Field1D *Ez1D = static_cast( EMfields->Ez_ ); - Field1D *Bx1D_m = static_cast( EMfields->Bx_m ); - Field1D *By1D_m = static_cast( EMfields->By_m ); - Field1D *Bz1D_m = static_cast( EMfields->Bz_m ); - + Field1D *Ex1D = static_cast( EMfields->Ex_ ); + Field1D *Ey1D = static_cast( EMfields->Ey_ ); + Field1D *Ez1D = static_cast( EMfields->Ez_ ); + Field1D *Bx1D = static_cast( EMfields->Bx_m ); + Field1D *By1D = static_cast( EMfields->By_m ); + Field1D *Bz1D = static_cast( EMfields->Bz_m ); // Particle position (in units of the spatial-step) - double xjn = particles.position( 0, ipart ) * dx_inv_; + double xpn = particles.position( 0, ipart ) * dx_inv_; // Calculate coeffs - coeffs( xjn ); + /*coeffs( xjn ); // Interpolate the fields from the Dual grid : Ex, By, Bz *( ELoc+0*nparts ) = compute( coeffd_, Ex1D, id_ ); @@ -42,7 +41,27 @@ void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, // Interpolate the fields from the Primal grid : Ey, Ez, Bx *( ELoc+1*nparts ) = compute( coeffp_, Ey1D, ip_ ); *( ELoc+2*nparts ) = compute( coeffp_, Ez1D, ip_ ); - *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ ); + *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ );*/ + + int idx_p[1], idx_d[1]; + double delta_p[1]; + double coeffxp[3]; + double coeffxd[3]; + + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); + + // Interpolation of Ex^(d) + ELoc[0*nparts+ipart] = compute( &coeffxd[0], Ex1D, idx_d[0] ); + // Interpolation of Ey^(p) + ELoc[1*nparts+ipart] = compute( &coeffxp[0], Ey1D, idx_p[0] ); + // Interpolation of Ez^(p) + ELoc[2*nparts+ipart] = compute( &coeffxp[0], Ez1D, idx_p[0] ); + // Interpolation of Bx^(p) + BLoc[0*nparts+ipart] = compute( &coeffxp[0], Bx1D, idx_p[0] ); + // Interpolation of By^(d) + BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); + // Interpolation of Bz^(d) + BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] ); }//END Interpolator1D2Order @@ -63,9 +82,9 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & Field1D *Ex1D = static_cast( EMfields->Ex_ ); Field1D *Ey1D = static_cast( EMfields->Ey_ ); Field1D *Ez1D = static_cast( EMfields->Ez_ ); - Field1D *Bx1D_m = static_cast( EMfields->Bx_m ); - Field1D *By1D_m = static_cast( EMfields->By_m ); - Field1D *Bz1D_m = static_cast( EMfields->Bz_m ); + Field1D *Bx1D = static_cast( EMfields->Bx_m ); + Field1D *By1D = static_cast( EMfields->By_m ); + Field1D *Bz1D = static_cast( EMfields->Bz_m ); Field1D *Jx1D = static_cast( EMfields->Jx_ ); Field1D *Jy1D = static_cast( EMfields->Jy_ ); Field1D *Jz1D = static_cast( EMfields->Jz_ ); @@ -78,12 +97,13 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & } // Particle position (in units of the spatial-step) - double xjn = particles.position( 0, ipart )*dx_inv_; + double xpn = particles.position( 0, ipart )*dx_inv_; // Calculate coeffs - coeffs( xjn ); + //coeffs( xjn ); int nparts( particles.numberOfParticles() ); + /* // Interpolate the fields from the Dual grid : Ex, By, Bz *( ELoc+0*nparts ) = 
compute( coeffd_, Ex1D, id_ ); *( BLoc+1*nparts ) = compute( coeffd_, By1D_m, id_ ); @@ -92,10 +112,46 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & // Interpolate the fields from the Primal grid : Ey, Ez, Bx *( ELoc+1*nparts ) = compute( coeffp_, Ey1D, ip_ ); *( ELoc+2*nparts ) = compute( coeffp_, Ez1D, ip_ ); - *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ ); + *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ );*/ + + int idx_p[1], idx_d[1]; + double delta_p[1]; + double coeffxp[3]; + double coeffxd[3]; + + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); + + // Interpolation of Ex^(d) + ELoc[0*nparts+ipart] = compute( &coeffxd[0], Ex1D, idx_d[0] ); + // Interpolation of Ey^(p) + ELoc[1*nparts+ipart] = compute( &coeffxp[0], Ey1D, idx_p[0] ); + // Interpolation of Ez^(p) + ELoc[2*nparts+ipart] = compute( &coeffxp[0], Ez1D, idx_p[0] ); + // Interpolation of Bx^(p) + BLoc[0*nparts+ipart] = compute( &coeffxp[0], Bx1D, idx_p[0] ); + // Interpolation of By^(d) + BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); + // Interpolation of Bz^(d) + BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] ); + + // Interpolation of Jx^(d,p) + JLoc->x = compute( &coeffxd[1], Jx1D, idx_d[0] ); + // Interpolation of Jy^(p,d) + JLoc->y = compute( &coeffxp[1], Jy1D, idx_p[0] ); + // Interpolation of Jz^(p,p) + JLoc->z = compute( &coeffxp[1], Jz1D, idx_p[0] ); + // Interpolation of Rho^(p,p) + ( *RhoLoc ) = compute( &coeffxp[1], Rho1D, idx_p[0]); + + if (smpi->use_BTIS3){ + // Interpolation of ByBTIS3^(p,p) + *( BLocyBTIS3+0*nparts ) = compute( &coeffxp[1], By1DBTIS3, idx_p[0]); + // Interpolation of BzBTIS3^(p,d) + *( BLoczBTIS3+0*nparts ) = compute( &coeffxp[1], Bz1DBTIS3, idx_p[0]); + } // Interpolate the fields from the Primal grid : Jy, Jz, Rho - JLoc->y = compute( coeffp_, Jy1D, ip_ ); + /*JLoc->y = compute( coeffp_, Jy1D, ip_ ); JLoc->z = compute( coeffp_, Jz1D, ip_ ); ( *RhoLoc ) = compute( coeffp_, Rho1D, ip_ ); @@ -105,7 +161,7 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & if (smpi->use_BTIS3){ *( BLocyBTIS3+0*nparts ) = compute( &coeffp_[1], By1DBTIS3, ip_ ); *( BLoczBTIS3+0*nparts ) = compute( &coeffp_[1], Bz1DBTIS3, ip_ ); - } + }*/ } @@ -113,12 +169,20 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *istart, int *iend, double *FieldLoc, double *, double *, double * ) { Field1D *F = static_cast( *field ); - double *coeff = F->isDual( 0 ) ? coeffd_ : coeffp_; - int *i = F->isDual( 0 ) ? &id_ : &ip_; + + int idx_p[1], idx_d[1]; + double delta_p[1]; + double coeffxp[3]; + double coeffxd[3]; + + double *coeff = F->isDual( 0 ) ? &coeffxd[1] : &coeffxp[1];//coeffd_ : coeffp_; + int *i = F->isDual( 0 ) ? 
&idx_d[0] : &idx_p[0]; //&id_ : &ip_; for( int ipart=*istart ; ipart<*iend; ipart++ ) { - double xjn = particles.position( 0, ipart )*dx_inv_; - coeffs( xjn ); + double xpn = particles.position( 0, ipart )*dx_inv_; + + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); + //coeffs( xpn ); FieldLoc[ipart] = compute( coeff, F, *i ); } } @@ -214,18 +278,26 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, #pragma acc exit data delete(this) #endif - } else { // with B-TIS3 interpolation - double *const __restrict__ BypartBTIS3 = smpi->dynamics_Bpart_yBTIS3[ithread].data(); - double *const __restrict__ BzpartBTIS3 = smpi->dynamics_Bpart_zBTIS3[ithread].data(); + }else { + + double *BypartBTIS3 = &( smpi->dynamics_Bpart_yBTIS3[ithread][0] ); + double *BzpartBTIS3 = &( smpi->dynamics_Bpart_zBTIS3[ithread][0] ); //*/ + //double *const __restrict__ BypartBTIS3 = smpi->dynamics_Bpart_yBTIS3[ithread].data(); + //double *const __restrict__ BzpartBTIS3 = smpi->dynamics_Bpart_zBTIS3[ithread].data(); + const double *const __restrict__ By1D_mBTIS3 = static_cast( EMfields->By_mBTIS3 )->data(); const double *const __restrict__ Bz1D_mBTIS3 = static_cast( EMfields->Bz_mBTIS3 )->data(); + + //double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data(); + + #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target map( to : i_domain_begin_) is_device_ptr ( position_x) #pragma omp teams distribute parallel for #elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) - size_t interpolation_range_size = ( last_index + 1 * nparts ) - first_index; + size_t interpolation_range_size = ( last_index + 0 * nparts ) - first_index; #pragma acc parallel present(ELoc [first_index:interpolation_range_size],\ BLoc [first_index:interpolation_range_size],\ BypartBTIS3 [first_index:interpolation_range_size],\ @@ -243,17 +315,11 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, deviceptr(position_x) \ copyin(d_inv_) #pragma acc loop gang worker vector -#endif - - // would it be possile to just use another #pragma acc parallel present( - // for By1D_mBTIS3 [0:sizeofEz],\ Bz1D_mBTIS3 [0:sizeofEy])\ BypartBTIS3 [first_index:interpolation_range_size],\ - BzpartBTIS3 [first_index:interpolation_range_size],\ - // ? 
- +#endif //*/ for (int ipart=*istart; ipart < *iend; ipart++){ // Normalized particle position - double xpn = position_x[ipart] * dx_inv_;//particles.position( 0, ipart )*dx_inv_; + double xpn = particles.position( 0, ipart )*dx_inv_; // Calculate coeffs int idx_p[1], idx_d[1]; @@ -276,19 +342,21 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, // Interpolation of Bz^(d) BLoc[2*nparts+ipart] = compute( coeffxd, Bz1D, idx_d[0] ); // Interpolation of ByBTIS3^(p) - BypartBTIS3[0*nparts+ipart ] = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); + *( BypartBTIS3+0*nparts ) = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); + // Interpolation of BzBTIS3^(p) + *( BzpartBTIS3+0*nparts ) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); + // Interpolation of ByBTIS3^(p) + //BypartBTIS3[0*nparts+ipart ] = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); // Interpolation of BzBTIS3^(p) - BzpartBTIS3[0*nparts+ipart ] = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); - + //BzpartBTIS3[0*nparts+ipart ] = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); //Buffering of iol and delta iold[0*nparts+ipart] = idx_p[0]; delta[0*nparts+ipart] = delta_p[0]; - } // end ipart loop #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) - #endif + #endif } // end with B-TIS interpolation } @@ -492,43 +560,45 @@ void Interpolator1D2Order::envelopeAndSusceptibility( ElectroMagn *EMfields, Par // Normalized particle position double xpn = particles.position( 0, ipart )*dx_inv_; + // Calculate coeffs + double coeffxp[3]; + // Indexes of the central nodes - ip_ = round( xpn ); + int ip = round( xpn ); // Declaration and calculation of the coefficient for interpolation double deltax, delta2; - - deltax = xpn - ( double )ip_; + deltax = xpn - ( double )ip; delta2 = deltax*deltax; - coeffp_[0] = 0.5 * ( delta2-deltax+0.25 ); - coeffp_[1] = 0.75 - delta2; - coeffp_[2] = 0.5 * ( delta2+deltax+0.25 ); + coeffxp[0] = 0.5 * ( delta2-deltax+0.25 ); + coeffxp[1] = 0.75 - delta2; + coeffxp[2] = 0.5 * ( delta2+deltax+0.25 ); //!\todo CHECK if this is correct for both primal & dual grids !!! 
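(A quick invariant worth keeping in mind while reviewing these coefficient refactorings: the three quadratic shape factors computed just above always sum to exactly 1, since 0.5*(d*d - d + 0.25) + (0.75 - d*d) + 0.5*(d*d + d + 0.25) = 1 for any d. A self-contained check, standalone code rather than Smilei code:

    // Standalone sanity check of the 2nd-order shape factors used above.
    #include <cassert>
    #include <cmath>

    int main()
    {
        double xpn = 10.3;                    // normalized particle position
        int    ip  = (int)std::round( xpn );  // central primal node: 10
        double d   = xpn - (double)ip;        // distance to that node: 0.3
        double d2  = d * d;
        double c0  = 0.5 * ( d2 - d + 0.25 ); // 0.02
        double c1  = 0.75 - d2;               // 0.66
        double c2  = 0.5 * ( d2 + d + 0.25 ); // 0.32
        assert( std::fabs( c0 + c1 + c2 - 1.0 ) < 1e-12 );
        return 0;
    }

The dual-grid weights obey the same identity, since they use the same polynomial evaluated half a cell away.)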
// First index for summation - ip_ = ip_ - i_domain_begin_; + ip = ip - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) // ------------------------- - *( Env_A_abs_Loc ) = compute( coeffp_, Env_A_abs_1D, ip_ ); //compute( &coeffp_[1], Env_A_abs_1D, ip_ ); + *( Env_A_abs_Loc ) = compute( coeffxp, Env_A_abs_1D, ip ); //compute( &coeffp_[1], Env_A_abs_1D, ip_ ); // ------------------------- // Interpolation of Env_Chi_^(p) // ------------------------- - *( Env_Chi_Loc ) = compute( coeffp_, Env_Chi_1D, ip_ ); //compute( &coeffp_[1], Env_Chi_1D, ip_ ); + *( Env_Chi_Loc ) = compute( coeffxp, Env_Chi_1D, ip ); //compute( &coeffp_[1], Env_Chi_1D, ip_ ); // ------------------------- // Interpolation of Env_E_abs_^(p) // ------------------------- - *( Env_E_abs_Loc ) = compute( coeffp_, Env_E_abs_1D, ip_ ); // compute( &coeffp_[1], Env_E_abs_1D, ip_ ); + *( Env_E_abs_Loc ) = compute( coeffxp, Env_E_abs_1D, ip ); // compute( &coeffp_[1], Env_E_abs_1D, ip_ ); // ------------------------- // Interpolation of Env_Ex_abs_^(p) // ------------------------- - *( Env_Ex_abs_Loc ) = compute( coeffp_, Env_Ex_abs_1D, ip_ ); // compute( &coeffp_[1], Env_Ex_abs_1D, ip_ ); + *( Env_Ex_abs_Loc ) = compute( coeffxp, Env_Ex_abs_1D, ip ); // compute( &coeffp_[1], Env_Ex_abs_1D, ip_ ); } // END Interpolator1D2Order diff --git a/src/Interpolator/Interpolator1D2Order.h b/src/Interpolator/Interpolator1D2Order.h index 9a1b2a9e4..44e6651d4 100755 --- a/src/Interpolator/Interpolator1D2Order.h +++ b/src/Interpolator/Interpolator1D2Order.h @@ -48,34 +48,6 @@ class Interpolator1D2Order final : public Interpolator1D void envelopeFieldForIonization( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ipart_ref = 0 ) override final; private: - inline void __attribute__((always_inline)) coeffs( double xjn ) - { - double xjmxi2; - - // Dual - id_ = std::round( xjn + 0.5 ); // index of the central point - xjmxi = xjn - static_cast(id_) + 0.5; // normalized distance to the central node - xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the central node - - // 2nd order interpolation on 3 nodes - coeffd_[0] = 0.5 * ( xjmxi2-xjmxi + 0.25 ); - coeffd_[1] = ( 0.75 - xjmxi2 ); - coeffd_[2] = 0.5 * ( xjmxi2+xjmxi + 0.25 ); - - id_ -= i_domain_begin_; - - // Primal - ip_ = std::round( xjn ); // index of the central point - xjmxi = xjn - static_cast(ip_); // normalized distance to the central node - xjmxi2 = xjmxi * xjmxi; // square of the normalized distance to the central node - - // 2nd order interpolation on 3 nodes - coeffp_[0] = 0.5 * ( xjmxi2 - xjmxi + 0.25 ); - coeffp_[1] = ( 0.75 - xjmxi2 ); - coeffp_[2] = 0.5 * ( xjmxi2 + xjmxi + 0.25 ); - - ip_ -= i_domain_begin_; - } // 2nd order interpolation on 3 nodes SMILEI_ACCELERATOR_DECLARE_ROUTINE @@ -109,18 +81,6 @@ class Interpolator1D2Order final : public Interpolator1D } SMILEI_ACCELERATOR_DECLARE_ROUTINE_END - // Last prim index computed - int ip_; - // Last dual index computed - int id_; - // Last delta computed - double xjmxi; - // Interpolation coefficient on Prim grid - double coeffp_[3]; - // Interpolation coefficient on Dual grid - double coeffd_[3]; - - };//END class #endif From 04564b0819fc841d6165b816aee05889d7cf2be5 Mon Sep 17 00:00:00 2001 From: "charles.prouveur" Date: Tue, 4 Jun 2024 13:00:57 +0200 Subject: [PATCH 44/54] fixing previous commit --- src/Interpolator/Interpolator1D2Order.cpp | 123 ++++++++-------------- 1 file changed, 44 insertions(+), 79 deletions(-) diff --git 
a/src/Interpolator/Interpolator1D2Order.cpp b/src/Interpolator/Interpolator1D2Order.cpp index b21954702..cc0a62cff 100755 --- a/src/Interpolator/Interpolator1D2Order.cpp +++ b/src/Interpolator/Interpolator1D2Order.cpp @@ -8,7 +8,6 @@ #include "Particles.h" #include "LaserEnvelope.h" - using namespace std; Interpolator1D2Order::Interpolator1D2Order( Params ¶ms, Patch *patch ) : Interpolator1D( patch ) @@ -30,18 +29,6 @@ void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, Field1D *Bz1D = static_cast( EMfields->Bz_m ); // Particle position (in units of the spatial-step) double xpn = particles.position( 0, ipart ) * dx_inv_; - // Calculate coeffs - /*coeffs( xjn ); - - // Interpolate the fields from the Dual grid : Ex, By, Bz - *( ELoc+0*nparts ) = compute( coeffd_, Ex1D, id_ ); - *( BLoc+1*nparts ) = compute( coeffd_, By1D_m, id_ ); - *( BLoc+2*nparts ) = compute( coeffd_, Bz1D_m, id_ ); - - // Interpolate the fields from the Primal grid : Ey, Ez, Bx - *( ELoc+1*nparts ) = compute( coeffp_, Ey1D, ip_ ); - *( ELoc+2*nparts ) = compute( coeffp_, Ez1D, ip_ ); - *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ );*/ int idx_p[1], idx_d[1]; double delta_p[1]; @@ -61,7 +48,18 @@ void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, // Interpolation of By^(d) BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); // Interpolation of Bz^(d) - BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] ); + BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] );//*/ + +/* + // Interpolate the fields from the Dual grid : Ex, By, Bz + *( ELoc+0*nparts ) = compute( coeffxd, Ex1D, idx_d[0] ); + *( BLoc+1*nparts ) = compute( coeffxd, By1D, idx_d[0] ); + *( BLoc+2*nparts ) = compute( coeffxd, Bz1D, idx_d[0] ); + + // Interpolate the fields from the Primal grid : Ey, Ez, Bx + *( ELoc+1*nparts ) = compute( coeffxp, Ey1D, idx_p[0] ); + *( ELoc+2*nparts ) = compute( coeffxp, Ez1D, idx_p[0] ); + *( BLoc+0*nparts ) = compute( coeffxp, Bx1D, idx_p[0] );*/ }//END Interpolator1D2Order @@ -79,16 +77,16 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & } // Static cast of the electromagnetic fields - Field1D *Ex1D = static_cast( EMfields->Ex_ ); - Field1D *Ey1D = static_cast( EMfields->Ey_ ); - Field1D *Ez1D = static_cast( EMfields->Ez_ ); - Field1D *Bx1D = static_cast( EMfields->Bx_m ); - Field1D *By1D = static_cast( EMfields->By_m ); - Field1D *Bz1D = static_cast( EMfields->Bz_m ); - Field1D *Jx1D = static_cast( EMfields->Jx_ ); - Field1D *Jy1D = static_cast( EMfields->Jy_ ); - Field1D *Jz1D = static_cast( EMfields->Jz_ ); - Field1D *Rho1D = static_cast( EMfields->rho_ ); + Field1D *Ex1D = static_cast( EMfields->Ex_ ); + Field1D *Ey1D = static_cast( EMfields->Ey_ ); + Field1D *Ez1D = static_cast( EMfields->Ez_ ); + Field1D *Bx1D = static_cast( EMfields->Bx_m ); + Field1D *By1D = static_cast( EMfields->By_m ); + Field1D *Bz1D = static_cast( EMfields->Bz_m ); + Field1D *Jx1D = static_cast( EMfields->Jx_ ); + Field1D *Jy1D = static_cast( EMfields->Jy_ ); + Field1D *Jz1D = static_cast( EMfields->Jz_ ); + Field1D *Rho1D = static_cast( EMfields->rho_ ); Field1D *By1DBTIS3; Field1D *Bz1DBTIS3; if (smpi->use_BTIS3){ @@ -99,69 +97,40 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & // Particle position (in units of the spatial-step) double xpn = particles.position( 0, ipart )*dx_inv_; // Calculate coeffs - //coeffs( xjn ); + int idx_p[1], idx_d[1]; + double delta_p[1]; + double 
coeffxp[3]; + double coeffxd[3]; + + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); int nparts( particles.numberOfParticles() ); - /* // Interpolate the fields from the Dual grid : Ex, By, Bz - *( ELoc+0*nparts ) = compute( coeffd_, Ex1D, id_ ); - *( BLoc+1*nparts ) = compute( coeffd_, By1D_m, id_ ); - *( BLoc+2*nparts ) = compute( coeffd_, Bz1D_m, id_ ); + *( ELoc+0*nparts ) = compute( coeffxd, Ex1D, idx_d[0] ); + *( BLoc+1*nparts ) = compute( coeffxd, By1D, idx_d[0] ); + *( BLoc+2*nparts ) = compute( coeffxd, Bz1D, idx_d[0] ); // Interpolate the fields from the Primal grid : Ey, Ez, Bx - *( ELoc+1*nparts ) = compute( coeffp_, Ey1D, ip_ ); - *( ELoc+2*nparts ) = compute( coeffp_, Ez1D, ip_ ); - *( BLoc+0*nparts ) = compute( coeffp_, Bx1D_m, ip_ );*/ + *( ELoc+1*nparts ) = compute( coeffxp, Ey1D, idx_p[0] ); + *( ELoc+2*nparts ) = compute( coeffxp, Ez1D, idx_p[0] ); + *( BLoc+0*nparts ) = compute( coeffxp, Bx1D, idx_p[0] );//*/ - int idx_p[1], idx_d[1]; - double delta_p[1]; - double coeffxp[3]; - double coeffxd[3]; - - coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); + // Interpolate the fields from the Primal grid : Jy, Jz, Rho + JLoc->y = compute( coeffxp, Jy1D, idx_p[0] ); + JLoc->z = compute( coeffxp, Jz1D, idx_p[0] ); + ( *RhoLoc ) = compute( coeffxp, Rho1D, idx_p[0] ); - // Interpolation of Ex^(d) - ELoc[0*nparts+ipart] = compute( &coeffxd[0], Ex1D, idx_d[0] ); - // Interpolation of Ey^(p) - ELoc[1*nparts+ipart] = compute( &coeffxp[0], Ey1D, idx_p[0] ); - // Interpolation of Ez^(p) - ELoc[2*nparts+ipart] = compute( &coeffxp[0], Ez1D, idx_p[0] ); - // Interpolation of Bx^(p) - BLoc[0*nparts+ipart] = compute( &coeffxp[0], Bx1D, idx_p[0] ); - // Interpolation of By^(d) - BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); - // Interpolation of Bz^(d) - BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] ); - - // Interpolation of Jx^(d,p) - JLoc->x = compute( &coeffxd[1], Jx1D, idx_d[0] ); - // Interpolation of Jy^(p,d) - JLoc->y = compute( &coeffxp[1], Jy1D, idx_p[0] ); - // Interpolation of Jz^(p,p) - JLoc->z = compute( &coeffxp[1], Jz1D, idx_p[0] ); - // Interpolation of Rho^(p,p) - ( *RhoLoc ) = compute( &coeffxp[1], Rho1D, idx_p[0]); + // Interpolate the fields from the Dual grid : Jx + JLoc->x = compute( coeffxd, Jx1D, idx_d[0] ); if (smpi->use_BTIS3){ // Interpolation of ByBTIS3^(p,p) - *( BLocyBTIS3+0*nparts ) = compute( &coeffxp[1], By1DBTIS3, idx_p[0]); + *( BLocyBTIS3+0*nparts ) = compute( &coeffxp[0], By1DBTIS3, idx_p[0]); // Interpolation of BzBTIS3^(p,d) - *( BLoczBTIS3+0*nparts ) = compute( &coeffxp[1], Bz1DBTIS3, idx_p[0]); + *( BLoczBTIS3+0*nparts ) = compute( &coeffxp[0], Bz1DBTIS3, idx_p[0]); } - // Interpolate the fields from the Primal grid : Jy, Jz, Rho - /*JLoc->y = compute( coeffp_, Jy1D, ip_ ); - JLoc->z = compute( coeffp_, Jz1D, ip_ ); - ( *RhoLoc ) = compute( coeffp_, Rho1D, ip_ ); - - // Interpolate the fields from the Dual grid : Jx - JLoc->x = compute( coeffd_, Jx1D, id_ ); - - if (smpi->use_BTIS3){ - *( BLocyBTIS3+0*nparts ) = compute( &coeffp_[1], By1DBTIS3, ip_ ); - *( BLoczBTIS3+0*nparts ) = compute( &coeffp_[1], Bz1DBTIS3, ip_ ); - }*/ } @@ -169,13 +138,11 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *istart, int *iend, double *FieldLoc, double *, double *, double * ) { Field1D *F = static_cast( *field ); - int idx_p[1], idx_d[1]; double delta_p[1]; double coeffxp[3]; double coeffxd[3]; - - double 
*coeff = F->isDual( 0 ) ? &coeffxd[1] : &coeffxp[1];//coeffd_ : coeffp_; + double *coeff = F->isDual( 0 ) ? coeffxd : coeffxp; int *i = F->isDual( 0 ) ? &idx_d[0] : &idx_p[0]; //&id_ : &ip_; for( int ipart=*istart ; ipart<*iend; ipart++ ) { @@ -288,9 +255,7 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, const double *const __restrict__ By1D_mBTIS3 = static_cast( EMfields->By_mBTIS3 )->data(); const double *const __restrict__ Bz1D_mBTIS3 = static_cast( EMfields->Bz_mBTIS3 )->data(); - //double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data(); - - +/* #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target map( to : i_domain_begin_) is_device_ptr ( position_x) #pragma omp teams distribute parallel for From 7b237fe2a78b573a324fd72f16fdce3d498f8f81 Mon Sep 17 00:00:00 2001 From: Arnaud Beck Date: Tue, 4 Jun 2024 16:16:45 +0200 Subject: [PATCH 45/54] Update partners --- doc/Sphinx/Overview/partners.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/Sphinx/Overview/partners.rst b/doc/Sphinx/Overview/partners.rst index 69b87e746..87d9d978a 100755 --- a/doc/Sphinx/Overview/partners.rst +++ b/doc/Sphinx/Overview/partners.rst @@ -52,11 +52,11 @@ Partners | | `Maison de la Simulation `_ (MdlS), USR 3441 | | | | + +---------------------------------------------------------------------------------------------------------+ -| | * `Olga Abramkina `_ | -| | * `Julien Dérouillat `_ | +| | * `Olga Abramkina `_ (Developer) | +| | * `Julien Dérouillat `_ (Cofounder) | | | * `Haithem Kallala `_ | -| | * `Mathieu Lobet `_ | -| | * `Charles Prouveur `_ | +| | * `Mathieu Lobet `_ (Developer) | +| | * `Charles Prouveur `_ (Architect) | | | | +------------+---------------------------------------------------------------------------------------------------------+ @@ -67,11 +67,11 @@ Partners | | `Laboratoire pour l'Utilisation des Lasers Intenses `_ (LULI), UMR 7605 | | | | + +-------------------------------------------------------------------------------------------------------------+ -| | * `Mickael Grech `_ | -| | * `Tommaso Vinci `_ | +| | * `Mickael Grech `_ (Founder) | +| | * `Tommaso Vinci `_ (Developer) | | | * `Marco Chiaramello `_ | | | * `Anna Grassi `_ | -| | * `Frédéric Pérez `_ | +| | * `Frédéric Pérez `_ (Community manager, Developer) | | | * `Caterina Riconda `_ | | | | +------------+-------------------------------------------------------------------------------------------------------------+ @@ -83,9 +83,9 @@ Partners | | `Laboratoire Leprince-Ringuet `_ (LLR), UMR 7638 | + +---------------------------------------------------------------------------------------------------------+ | | | -| | * `Arnaud Beck `_ | +| | * `Arnaud Beck `_ (Project Coordinator, Cofounder, Developer) | | | * `Imen Zemzemi `_ | -| | * `Guillaume Bouchard `_ | +| | * `Guillaume Bouchard `_ (Developer) | +------------+---------------------------------------------------------------------------------------------------------+ .. rst-class:: noborder @@ -95,7 +95,7 @@ Partners | | `Laboratoire de Physique des Gaz et des Plasmas `_ (LPGP), UMR 8578 | + +----------------------------------------------------------------------------------------------------------------------+ | | | -| | * `Francesco Massimo `_ | +| | * `Francesco Massimo `_ (Developer) | +------------+----------------------------------------------------------------------------------------------------------------------+ .. 
rst-class:: noborder @@ -105,7 +105,7 @@ Partners | | `Institut du developpement et des ressources en informatique scientifique `_ (IDRIS), UPS 851 | + +----------------------------------------------------------------------------------------------------------------------+ | | | -| | * `Olga Abramkina `_ | +| | * `Olga Abramkina `_ (Developer) | | | * `Marie Flé `_ | +------------+----------------------------------------------------------------------------------------------------------------------+ From 59ce0096400de28d9dbe18732cf52467a8b71cfe Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Tue, 4 Jun 2024 16:44:10 +0200 Subject: [PATCH 46/54] fixes for sphinx >= 5 --- doc/Sphinx/smilei_theme/layout.html | 2 +- doc/Sphinx/smilei_theme/static/smilei_theme.css_t | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/Sphinx/smilei_theme/layout.html b/doc/Sphinx/smilei_theme/layout.html index 592f7e532..1bed82e81 100755 --- a/doc/Sphinx/smilei_theme/layout.html +++ b/doc/Sphinx/smilei_theme/layout.html @@ -97,7 +97,7 @@
diff --git a/doc/Sphinx/smilei_theme/static/smilei_theme.css_t b/doc/Sphinx/smilei_theme/static/smilei_theme.css_t index fdf918810..4de1a7428 100755 --- a/doc/Sphinx/smilei_theme/static/smilei_theme.css_t +++ b/doc/Sphinx/smilei_theme/static/smilei_theme.css_t @@ -172,6 +172,10 @@ a:hover { text-decoration: underline; } +a:visited { + color:{{ theme_main_color_bold }}; color: var(--main_bold); +} + div.body h1, div.body h2, div.body h3, @@ -402,6 +406,14 @@ table.footnote td { padding: 0.3em 0.5em; } +table.noborder { + width: 100%; +} + +table.noborder tr:first-child td:first-child { + width: 7em; +} + table.noborder, table.noborder td { border:0 !important; } From 51edda886db7b25ce74ff8c87915c2406920bce1 Mon Sep 17 00:00:00 2001 From: "charles.prouveur" Date: Tue, 4 Jun 2024 17:34:03 +0200 Subject: [PATCH 47/54] cleaning --- src/Interpolator/Interpolator1D2Order.cpp | 75 ++++++----------------- 1 file changed, 19 insertions(+), 56 deletions(-) diff --git a/src/Interpolator/Interpolator1D2Order.cpp b/src/Interpolator/Interpolator1D2Order.cpp index cc0a62cff..0d192afff 100755 --- a/src/Interpolator/Interpolator1D2Order.cpp +++ b/src/Interpolator/Interpolator1D2Order.cpp @@ -27,9 +27,10 @@ void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, Field1D *Bx1D = static_cast( EMfields->Bx_m ); Field1D *By1D = static_cast( EMfields->By_m ); Field1D *Bz1D = static_cast( EMfields->Bz_m ); + // Particle position (in units of the spatial-step) double xpn = particles.position( 0, ipart ) * dx_inv_; - + // Calculate coeffs int idx_p[1], idx_d[1]; double delta_p[1]; double coeffxp[3]; @@ -48,18 +49,7 @@ void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, // Interpolation of By^(d) BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); // Interpolation of Bz^(d) - BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] );//*/ - -/* - // Interpolate the fields from the Dual grid : Ex, By, Bz - *( ELoc+0*nparts ) = compute( coeffxd, Ex1D, idx_d[0] ); - *( BLoc+1*nparts ) = compute( coeffxd, By1D, idx_d[0] ); - *( BLoc+2*nparts ) = compute( coeffxd, Bz1D, idx_d[0] ); - - // Interpolate the fields from the Primal grid : Ey, Ez, Bx - *( ELoc+1*nparts ) = compute( coeffxp, Ey1D, idx_p[0] ); - *( ELoc+2*nparts ) = compute( coeffxp, Ez1D, idx_p[0] ); - *( BLoc+0*nparts ) = compute( coeffxp, Bx1D, idx_p[0] );*/ + BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] ); }//END Interpolator1D2Order @@ -143,11 +133,10 @@ void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *i double coeffxp[3]; double coeffxd[3]; double *coeff = F->isDual( 0 ) ? coeffxd : coeffxp; - int *i = F->isDual( 0 ) ? &idx_d[0] : &idx_p[0]; //&id_ : &ip_; + int *i = F->isDual( 0 ) ? 
&idx_d[0] : &idx_p[0]; //&id_ : &ip_; for( int ipart=*istart ; ipart<*iend; ipart++ ) { double xpn = particles.position( 0, ipart )*dx_inv_; - coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); //coeffs( xpn ); FieldLoc[ipart] = compute( coeff, F, *i ); @@ -491,9 +480,8 @@ void Interpolator1D2Order::timeCenteredEnvelope( ElectroMagn *EMfields, Particle int idx_p[1], idx_d[1]; double delta_p[1]; double coeffxp[3]; - double coeffxd[3]; - coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); + coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); // Interpolation of Phi^(p) *( PHI_mpart+0*nparts+ipart ) = compute( coeffxp, Phi_m1D, idx_d[0] ); @@ -525,50 +513,37 @@ void Interpolator1D2Order::envelopeAndSusceptibility( ElectroMagn *EMfields, Par // Normalized particle position double xpn = particles.position( 0, ipart )*dx_inv_; - // Calculate coeffs - double coeffxp[3]; - // Indexes of the central nodes - int ip = round( xpn ); - - // Declaration and calculation of the coefficient for interpolation - double deltax, delta2; - - deltax = xpn - ( double )ip; - delta2 = deltax*deltax; - coeffxp[0] = 0.5 * ( delta2-deltax+0.25 ); - coeffxp[1] = 0.75 - delta2; - coeffxp[2] = 0.5 * ( delta2+deltax+0.25 ); - - - //!\todo CHECK if this is correct for both primal & dual grids !!! - // First index for summation - ip = ip - i_domain_begin_; + int idx_p[1]; + double delta_p[1]; + double coeffxp[3]; + coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); // ------------------------- // Interpolation of Env_A_abs_^(p) // ------------------------- - *( Env_A_abs_Loc ) = compute( coeffxp, Env_A_abs_1D, ip ); //compute( &coeffp_[1], Env_A_abs_1D, ip_ ); + *( Env_A_abs_Loc ) = compute( coeffxp, Env_A_abs_1D, idx_p[0] ); // ------------------------- // Interpolation of Env_Chi_^(p) // ------------------------- - *( Env_Chi_Loc ) = compute( coeffxp, Env_Chi_1D, ip ); //compute( &coeffp_[1], Env_Chi_1D, ip_ ); + *( Env_Chi_Loc ) = compute( coeffxp, Env_Chi_1D, idx_p[0] ); // ------------------------- // Interpolation of Env_E_abs_^(p) // ------------------------- - *( Env_E_abs_Loc ) = compute( coeffxp, Env_E_abs_1D, ip ); // compute( &coeffp_[1], Env_E_abs_1D, ip_ ); + *( Env_E_abs_Loc ) = compute( coeffxp, Env_E_abs_1D, idx_p[0] ); // ------------------------- // Interpolation of Env_Ex_abs_^(p) // ------------------------- - *( Env_Ex_abs_Loc ) = compute( coeffxp, Env_Ex_abs_1D, ip ); // compute( &coeffp_[1], Env_Ex_abs_1D, ip_ ); + *( Env_Ex_abs_Loc ) = compute( coeffxp, Env_Ex_abs_1D, idx_p[0] ); } // END Interpolator1D2Order void Interpolator1D2Order::envelopeFieldForIonization( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ) { + // Static cast of the envelope fields Field1D *Env_Eabs = static_cast( EMfields->Env_E_abs_ ); @@ -577,32 +552,20 @@ void Interpolator1D2Order::envelopeFieldForIonization( ElectroMagn *EMfields, Pa //Loop on bin particles for( int ipart=*istart ; ipart<*iend; ipart++ ) { - int idx_p[1]; - double delta_p[1]; - double coeffxp[3]; - // Normalized particle position double xpn = particles.position( 0, ipart )*dx_inv_; - double delta2; - - // Primal - idx_p[0] = round( xpn ); // index of the central point - delta_p[0] = xpn -( double )idx_p[0]; // normalized distance to the central node - delta2 = pow( delta_p[0], 2 ); // square of the normalized distance to the central node - - // 2nd order interpolation on 3 nodes - coeffxp[0] = 0.5 * ( delta2-delta_p[0]+0.25 ); - coeffxp[1] = ( 0.75-delta2 ); - coeffxp[2] = 0.5 * ( 
delta2+delta_p[0]+0.25 ); - - idx_p[0] -= i_domain_begin_; + int idx_p[1]; + double delta_p[1]; + double coeffxp[3]; + coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); // --------------------------------- // Interpolation of Env_E_abs^(p) // --------------------------------- ( *Env_Eabs_part )[ipart] = compute( coeffxp, Env_Eabs, idx_p[0] ); + // In 1D the Env_Ex_abs field is always zero } From 13641d46e0f7055818e6560a66ef94f92ca6fd68 Mon Sep 17 00:00:00 2001 From: "charles.prouveur" Date: Tue, 4 Jun 2024 18:06:02 +0200 Subject: [PATCH 48/54] small change --- src/Interpolator/Interpolator1D2Order.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpolator/Interpolator1D2Order.cpp b/src/Interpolator/Interpolator1D2Order.cpp index 0d192afff..3625e98a0 100755 --- a/src/Interpolator/Interpolator1D2Order.cpp +++ b/src/Interpolator/Interpolator1D2Order.cpp @@ -132,7 +132,7 @@ void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *i double delta_p[1]; double coeffxp[3]; double coeffxd[3]; - double *coeff = F->isDual( 0 ) ? coeffxd : coeffxp; + double *coeff = F->isDual( 0 ) ? &coeffxd[0] : &coeffxp[0]; int *i = F->isDual( 0 ) ? &idx_d[0] : &idx_p[0]; //&id_ : &ip_; for( int ipart=*istart ; ipart<*iend; ipart++ ) { From b7c9d9df1b8c9080efaed80784b64b00120d21be Mon Sep 17 00:00:00 2001 From: "charles.prouveur" Date: Wed, 5 Jun 2024 09:57:20 +0200 Subject: [PATCH 49/54] took out st1d_24_cir_plane_wave_BTIS3.py for further investigations --- benchmarks/tst1d_24_cir_plane_wave_BTIS3.py | 95 ------------- src/Interpolator/Interpolator1D2Order.cpp | 144 +++++++++++++------- src/Interpolator/Interpolator1D4Order.h | 86 +++--------- 3 files changed, 114 insertions(+), 211 deletions(-) delete mode 100755 benchmarks/tst1d_24_cir_plane_wave_BTIS3.py diff --git a/benchmarks/tst1d_24_cir_plane_wave_BTIS3.py b/benchmarks/tst1d_24_cir_plane_wave_BTIS3.py deleted file mode 100755 index cab778662..000000000 --- a/benchmarks/tst1d_24_cir_plane_wave_BTIS3.py +++ /dev/null @@ -1,95 +0,0 @@ -# _____________________________________________________________________________ -# -# Electron trajectory in a plane wave -# with a Gaussian temporal profile. -# -# Validation in the relativist regime -# -# _____________________________________________________________________________ - -import math - -# _____________________________________________________________________________ -# Main parameters - -l0 = 2.0*math.pi # laser wavelength -t0 = l0 # optical cicle -Lx = 80*l0 - -n0 = 1e-8 # particle density - -Tsim = 150.*t0 # duration of the simulation -resx = 64. 
# nb of cells in one laser wavelength - -dx = l0/resx # space step -dt = 0.95 * dx # timestep (0.95 x CFL) - -a0 = 5 -start = 0 # Laser start -fwhm = 10*t0 # Gaussian time fwhm -duration = 90*t0 # Laser duration -center = duration*0.5 # Laser profile center - -pusher = "borisBTIS3" - -# Density profile for inital location of the particles -def n0_(x): - if (dxy = compute( coeffxp, Jy1D, idx_p[0] ); - JLoc->z = compute( coeffxp, Jz1D, idx_p[0] ); + JLoc->y = compute( coeffxp, Jy1D, idx_p[0] ); + JLoc->z = compute( coeffxp, Jz1D, idx_p[0] ); ( *RhoLoc ) = compute( coeffxp, Rho1D, idx_p[0] ); // Interpolate the fields from the Dual grid : Jx JLoc->x = compute( coeffxd, Jx1D, idx_d[0] ); if (smpi->use_BTIS3){ - // Interpolation of ByBTIS3^(p,p) - *( BLocyBTIS3+0*nparts ) = compute( &coeffxp[0], By1DBTIS3, idx_p[0]); - // Interpolation of BzBTIS3^(p,d) - *( BLoczBTIS3+0*nparts ) = compute( &coeffxp[0], Bz1DBTIS3, idx_p[0]); + *( BLocyBTIS3+0*nparts ) = compute( coeffxp, By1DBTIS3, idx_p[0] ); + *( BLoczBTIS3+0*nparts ) = compute( coeffxp, Bz1DBTIS3, idx_p[0] ); } - - } // Interpolator on another field than the basic ones @@ -132,13 +152,12 @@ void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *i double delta_p[1]; double coeffxp[3]; double coeffxd[3]; - double *coeff = F->isDual( 0 ) ? &coeffxd[0] : &coeffxp[0]; - int *i = F->isDual( 0 ) ? &idx_d[0] : &idx_p[0]; //&id_ : &ip_; + double *coeff = F->isDual( 0 ) ? coeffxd : coeffxp; + int *i = F->isDual( 0 ) ? &idx_d[0] : &idx_p[0]; for( int ipart=*istart ; ipart<*iend; ipart++ ) { double xpn = particles.position( 0, ipart )*dx_inv_; coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); - //coeffs( xpn ); FieldLoc[ipart] = compute( coeff, F, *i ); } } @@ -389,13 +408,13 @@ void Interpolator1D2Order::fieldsAndEnvelope( ElectroMagn *EMfields, Particles & // Interpolation of Bz^(d) *( Bpart+2*nparts+ipart ) = compute( coeffxd, Bz1D, idx_d[0] ); // Interpolation of Phi^(p) - *( PHIpart+0*nparts+ipart ) = compute( coeffxp, Phi1D, idx_d[0] ); + *( PHIpart+0*nparts+ipart ) = compute( coeffxp, Phi1D, idx_p[0] ); // Interpolation of GradPhix^(p) - *( GradPHIpart+0*nparts+ipart ) = compute( coeffxp, GradPhix1D, idx_d[0] ); + *( GradPHIpart+0*nparts+ipart ) = compute( coeffxp, GradPhix1D, idx_p[0] ); // Interpolation of GradPhiy^(p) - *( GradPHIpart+1*nparts+ipart ) = compute( coeffxp, GradPhiy1D, idx_d[0] ); + *( GradPHIpart+1*nparts+ipart ) = compute( coeffxp, GradPhiy1D, idx_p[0] ); // Interpolation of GradPhiz^(p) - *( GradPHIpart+2*nparts+ipart ) = compute( coeffxp, GradPhiz1D, idx_d[0] ); + *( GradPHIpart+2*nparts+ipart ) = compute( coeffxp, GradPhiz1D, idx_p[0] ); //Buffering of iol and delta *( iold+0*nparts+ipart) = idx_p[0]; @@ -433,17 +452,17 @@ void Interpolator1D2Order::fieldsAndEnvelope( ElectroMagn *EMfields, Particles & // Interpolation of Bz^(d) *( Bpart+2*nparts+ipart ) = compute( coeffxd, Bz1D, idx_d[0] ); // Interpolation of ByBTIS3^(p) - *( BypartBTIS3+0*nparts ) = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); + *( BypartBTIS3+0*nparts) = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); // Interpolation of BzBTIS3^(p) - *( BzpartBTIS3+0*nparts ) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); + *( BzpartBTIS3+0*nparts) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); // Interpolation of Phi^(p) - *( PHIpart+0*nparts+ipart ) = compute( coeffxp, Phi1D, idx_d[0] ); + *( PHIpart+0*nparts+ipart ) = compute( coeffxp, Phi1D, idx_p[0] ); // Interpolation of GradPhix^(p) - *( GradPHIpart+0*nparts+ipart ) = compute( coeffxp, 
GradPhix1D, idx_d[0] ); + *( GradPHIpart+0*nparts+ipart ) = compute( coeffxp, GradPhix1D, idx_p[0] ); // Interpolation of GradPhiy^(p) - *( GradPHIpart+1*nparts+ipart ) = compute( coeffxp, GradPhiy1D, idx_d[0] ); + *( GradPHIpart+1*nparts+ipart ) = compute( coeffxp, GradPhiy1D, idx_p[0] ); // Interpolation of GradPhiz^(p) - *( GradPHIpart+2*nparts+ipart ) = compute( coeffxp, GradPhiz1D, idx_d[0] ); + *( GradPHIpart+2*nparts+ipart ) = compute( coeffxp, GradPhiz1D, idx_p[0] ); //Buffering of iol and delta *( iold+0*nparts+ipart) = idx_p[0]; @@ -480,23 +499,22 @@ void Interpolator1D2Order::timeCenteredEnvelope( ElectroMagn *EMfields, Particle int idx_p[1], idx_d[1]; double delta_p[1]; double coeffxp[3]; + double coeffxd[3]; - coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); // Interpolation of Phi^(p) - *( PHI_mpart+0*nparts+ipart ) = compute( coeffxp, Phi_m1D, idx_d[0] ); + *( PHI_mpart+0*nparts+ipart ) = compute( coeffxp, Phi_m1D, idx_p[0] ); // Interpolation of GradPhix^(p) - *( GradPHI_mpart+0*nparts+ipart ) = compute( coeffxp, GradPhix_m1D, idx_d[0] ); + *( GradPHI_mpart+0*nparts+ipart ) = compute( coeffxp, GradPhix_m1D, idx_p[0] ); // Interpolation of GradPhiy^(p) - *( GradPHI_mpart+1*nparts+ipart ) = compute( coeffxp, GradPhiy_m1D, idx_d[0] ); + *( GradPHI_mpart+1*nparts+ipart ) = compute( coeffxp, GradPhiy_m1D, idx_p[0] ); // Interpolation of GradPhiz^(p) - *( GradPHI_mpart+2*nparts+ipart ) = compute( coeffxp, GradPhiz_m1D, idx_d[0] ); + *( GradPHI_mpart+2*nparts+ipart ) = compute( coeffxp, GradPhiz_m1D, idx_p[0] ); //Buffering of iol and delta *( iold+ipart+0*nparts) = idx_p[0]; *( delta+ipart+0*nparts) = delta_p[0]; - - } } // END Interpolator1D2Order @@ -513,37 +531,51 @@ void Interpolator1D2Order::envelopeAndSusceptibility( ElectroMagn *EMfields, Par // Normalized particle position double xpn = particles.position( 0, ipart )*dx_inv_; - // Indexes of the central nodes - int idx_p[1]; - double delta_p[1]; + // Calculate coeffs double coeffxp[3]; - coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); + + // Indexes of the central nodes + int ip = round( xpn ); + + // Declaration and calculation of the coefficient for interpolation + double deltax, delta2; + + deltax = xpn - ( double )ip; + delta2 = deltax*deltax; + coeffxp[0] = 0.5 * ( delta2-deltax+0.25 ); + coeffxp[1] = 0.75 - delta2; + coeffxp[2] = 0.5 * ( delta2+deltax+0.25 ); + + + //!\todo CHECK if this is correct for both primal & dual grids !!! 
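// The three weights just above are the 2nd-order (quadratic B-spline) shape
// functions S(-1) = 0.5*(delta^2 - delta + 0.25), S(0) = 0.75 - delta^2 and
// S(+1) = 0.5*(delta^2 + delta + 0.25), taken at the normalized distance
// delta in [-1/2, 1/2] from the central node. They sum to 1 for any delta,
// so a constant field is interpolated exactly. A minimal standalone check of
// that property (illustrative sketch, not part of the Smilei sources):

#include <cassert>
#include <cmath>

int main()
{
    for( double delta = -0.5; delta <= 0.5; delta += 0.05 ) {
        const double d2 = delta * delta;
        const double w[3] = { 0.5 * ( d2 - delta + 0.25 ),  // left node
                              0.75 - d2,                     // central node
                              0.5 * ( d2 + delta + 0.25 ) }; // right node
        assert( std::abs( w[0] + w[1] + w[2] - 1.0 ) < 1e-12 ); // partition of unity
    }
    return 0;
}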
+ // First index for summation + ip = ip - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) // ------------------------- - *( Env_A_abs_Loc ) = compute( coeffxp, Env_A_abs_1D, idx_p[0] ); + *( Env_A_abs_Loc ) = compute( coeffxp, Env_A_abs_1D, ip ); //compute( &coeffp_[1], Env_A_abs_1D, ip_ ); // ------------------------- // Interpolation of Env_Chi_^(p) // ------------------------- - *( Env_Chi_Loc ) = compute( coeffxp, Env_Chi_1D, idx_p[0] ); + *( Env_Chi_Loc ) = compute( coeffxp, Env_Chi_1D, ip ); //compute( &coeffp_[1], Env_Chi_1D, ip_ ); // ------------------------- // Interpolation of Env_E_abs_^(p) // ------------------------- - *( Env_E_abs_Loc ) = compute( coeffxp, Env_E_abs_1D, idx_p[0] ); + *( Env_E_abs_Loc ) = compute( coeffxp, Env_E_abs_1D, ip ); // compute( &coeffp_[1], Env_E_abs_1D, ip_ ); // ------------------------- // Interpolation of Env_Ex_abs_^(p) // ------------------------- - *( Env_Ex_abs_Loc ) = compute( coeffxp, Env_Ex_abs_1D, idx_p[0] ); + *( Env_Ex_abs_Loc ) = compute( coeffxp, Env_Ex_abs_1D, ip ); // compute( &coeffp_[1], Env_Ex_abs_1D, ip_ ); + } // END Interpolator1D2Order void Interpolator1D2Order::envelopeFieldForIonization( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ) { - // Static cast of the envelope fields Field1D *Env_Eabs = static_cast( EMfields->Env_E_abs_ ); @@ -552,20 +584,32 @@ void Interpolator1D2Order::envelopeFieldForIonization( ElectroMagn *EMfields, Pa //Loop on bin particles for( int ipart=*istart ; ipart<*iend; ipart++ ) { - // Normalized particle position - double xpn = particles.position( 0, ipart )*dx_inv_; - int idx_p[1]; double delta_p[1]; double coeffxp[3]; - coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); + + // Normalized particle position + double xpn = particles.position( 0, ipart )*dx_inv_; + + double delta2; + + // Primal + idx_p[0] = round( xpn ); // index of the central point + delta_p[0] = xpn -( double )idx_p[0]; // normalized distance to the central node + delta2 = pow( delta_p[0], 2 ); // square of the normalized distance to the central node + + // 2nd order interpolation on 3 nodes + coeffxp[0] = 0.5 * ( delta2-delta_p[0]+0.25 ); + coeffxp[1] = ( 0.75-delta2 ); + coeffxp[2] = 0.5 * ( delta2+delta_p[0]+0.25 ); + + idx_p[0] -= i_domain_begin_; // --------------------------------- // Interpolation of Env_E_abs^(p) // --------------------------------- ( *Env_Eabs_part )[ipart] = compute( coeffxp, Env_Eabs, idx_p[0] ); - // In 1D the Env_Ex_abs field is always zero } diff --git a/src/Interpolator/Interpolator1D4Order.h b/src/Interpolator/Interpolator1D4Order.h index 7bca2b949..0e8831091 100755 --- a/src/Interpolator/Interpolator1D4Order.h +++ b/src/Interpolator/Interpolator1D4Order.h @@ -33,64 +33,12 @@ class Interpolator1D4Order final : public Interpolator1D void envelopeAndSusceptibility( ElectroMagn *EMfields, Particles &particles, int ipart, double *Env_A_abs_Loc, double *Env_Chi_Loc, double *Env_E_abs_Loc, double *Env_Ex_abs_Loc ) override final; private: - inline void __attribute__((always_inline)) coeffs( double xjn ) - { - double xjmxi2, xjmxi3, xjmxi4; - - // Dual - id_ = round( xjn+0.5 ); // index of the central point - xjmxi = xjn -( double )id_+0.5; // normalized distance to the central node - xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the central node - xjmxi3 = xjmxi2*xjmxi; // cube of the normalized distance to the central node - xjmxi4 = xjmxi3*xjmxi; // 4th power of the normalized distance to the 
central node - - // coefficients for the 4th order interpolation on 5 nodes - coeffd_[0] = dble_1_ov_384 - dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 - dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; - coeffd_[1] = dble_19_ov_96 - dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 + dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; - coeffd_[2] = dble_115_ov_192 - dble_5_ov_8 * xjmxi2 + dble_1_ov_4 * xjmxi4; - coeffd_[3] = dble_19_ov_96 + dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 - dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; - coeffd_[4] = dble_1_ov_384 + dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 + dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; - - id_ -= i_domain_begin_; - - // Primal - ip_ = round( xjn ); // index of the central point - xjmxi = xjn -( double )ip_; // normalized distance to the central node - xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the central node - xjmxi3 = xjmxi2*xjmxi; // cube of the normalized distance to the central node - xjmxi4 = xjmxi3*xjmxi; // 4th power of the normalized distance to the central node - - // coefficients for the 4th order interpolation on 5 nodes - coeffp_[0] = dble_1_ov_384 - dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 - dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; - coeffp_[1] = dble_19_ov_96 - dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 + dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; - coeffp_[2] = dble_115_ov_192 - dble_5_ov_8 * xjmxi2 + dble_1_ov_4 * xjmxi4; - coeffp_[3] = dble_19_ov_96 + dble_11_ov_24 * xjmxi + dble_1_ov_4 * xjmxi2 - dble_1_ov_6 * xjmxi3 - dble_1_ov_6 * xjmxi4; - coeffp_[4] = dble_1_ov_384 + dble_1_ov_48 * xjmxi + dble_1_ov_16 * xjmxi2 + dble_1_ov_12 * xjmxi3 + dble_1_ov_24 * xjmxi4; - - ip_ -= i_domain_begin_; - } - inline void coeffs( double xpn, int* idx_p, int* idx_d, double *coeffxp, double *coeffxd, double* delta_p ) { double delta, delta2, delta3, delta4 ; - - // Dual - idx_d[0] = round( xpn+0.5 ); // index of the central point - delta = xpn -( double )idx_d[0]+0.5; // normalized distance to the central node - delta2 = delta*delta; // square of the normalized distance to the central node - delta3 = delta2*delta; // cube of the normalized distance to the central node - delta4 = delta3*delta; // 4th power of the normalized distance to the central node - - // coefficients for the 4th order interpolation on 5 nodes - coeffxd[0] = dble_1_ov_384 - dble_1_ov_48 * delta + dble_1_ov_16 * delta2 - dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; - coeffxd[1] = dble_19_ov_96 - dble_11_ov_24 * delta + dble_1_ov_4 * delta2 + dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; - coeffxd[2] = dble_115_ov_192 - dble_5_ov_8 * delta2 + dble_1_ov_4 * delta4; - coeffxd[3] = dble_19_ov_96 + dble_11_ov_24 * delta + dble_1_ov_4 * delta2 - dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; - coeffxd[4] = dble_1_ov_384 + dble_1_ov_48 * delta + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; - - idx_d[0] -= i_domain_begin_; - + + // Primal idx_p[0] = round( xpn ); // index of the central point delta_p[0] = xpn -( double )idx_p[0]; // normalized distance to the central node @@ -106,6 +54,24 @@ class Interpolator1D4Order final : public Interpolator1D coeffxp[4] = dble_1_ov_384 + dble_1_ov_48 * delta_p[0] + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; idx_p[0] -= i_domain_begin_; + + if(idx_d){ + // Dual + idx_d[0] = round( xpn+0.5 ); // index of the central point + delta = xpn -( double )idx_d[0]+0.5; // normalized distance to the central node + delta2 = delta*delta; // square of the 
normalized distance to the central node + delta3 = delta2*delta; // cube of the normalized distance to the central node + delta4 = delta3*delta; // 4th power of the normalized distance to the central node + + // coefficients for the 4th order interpolation on 5 nodes + coeffxd[0] = dble_1_ov_384 - dble_1_ov_48 * delta + dble_1_ov_16 * delta2 - dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; + coeffxd[1] = dble_19_ov_96 - dble_11_ov_24 * delta + dble_1_ov_4 * delta2 + dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; + coeffxd[2] = dble_115_ov_192 - dble_5_ov_8 * delta2 + dble_1_ov_4 * delta4; + coeffxd[3] = dble_19_ov_96 + dble_11_ov_24 * delta + dble_1_ov_4 * delta2 - dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; + coeffxd[4] = dble_1_ov_384 + dble_1_ov_48 * delta + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; + + idx_d[0] -= i_domain_begin_; + } } double dble_1_ov_384 ; @@ -120,18 +86,6 @@ class Interpolator1D4Order final : public Interpolator1D double dble_115_ov_192 ; double dble_5_ov_8 ; - // Last prim index computed - int ip_; - // Last dual index computed - int id_; - // Last delta computed - double xjmxi; - // Interpolation coefficient on Prim grid - double coeffp_[5]; - // Interpolation coefficient on Dual grid - double coeffd_[5]; - - };//END class #endif From e41df5ea46947b5a4dd5f8b7a649b9a0ee3c186c Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Wed, 5 Jun 2024 14:49:27 +0200 Subject: [PATCH 50/54] Persistent buffers for GPU sorting --- src/Particles/nvidiaParticles.cu | 349 ++++++++++++------------------- src/Particles/nvidiaParticles.h | 36 +--- 2 files changed, 145 insertions(+), 240 deletions(-) diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu index 42995603d..f84ed3463 100644 --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -313,27 +313,23 @@ namespace detail { particle_to_inject.scatterParticles( particle_container, available_places ); // If there are more imported particles than places, copy the remaining imported particles at the end if( inject_count >= erased_count ) { - particle_container.resize( new_count ); + particle_container.deviceResize( new_count ); particle_container.pasteParticles( &particle_to_inject, initial_count, erased_count ); // If there are more places than imported particles, the remaining places should be filled } else { const auto last_filled = available_places[inject_count]; particle_container.eraseParticlesByPredicate( cellKeyBelow<0>(), last_filled ); - particle_container.resize( new_count ); + particle_container.deviceResize( new_count ); } + particle_to_inject.deviceFree(); // Compute keys of particles computeParticleClusterKey( particle_container, parameters, a_parent_patch ); - // Use particle_to_inject as a buffer - particle_to_inject.softReserve( new_count ); - particle_to_inject.resize( new_count ); + // Sort particles by keys + particle_container.sortParticleByKey(); - // Sort particles using thrust::gather, according to the sorting map - // (particle_to_inject serves as a buffer) - particle_container.sortParticleByKey( particle_to_inject ); - - // Recompute bins + // Recompute bin locations computeBinIndex( particle_container ); } @@ -533,13 +529,7 @@ nvidiaParticles::~nvidiaParticles() { } } -void nvidiaParticles::resizeDimensions( unsigned int nDim ) -{ - nvidia_position_.resize( nDim ); - nvidia_momentum_.resize( 3 ); -} - -void nvidiaParticles::softReserve( unsigned int particle_count, float growth_factor ) +void nvidiaParticles::deviceReserve( 
unsigned int particle_count, float growth_factor ) { if( particle_count <= deviceCapacity() ) { // Dont reserve, for now we have enough capacity. @@ -548,23 +538,12 @@ void nvidiaParticles::softReserve( unsigned int particle_count, float growth_fac const unsigned int new_capacity = static_cast( particle_count * growth_factor ); - for( unsigned int idim = 0; idim < nvidia_position_.size(); idim++ ) { - nvidia_position_[idim].reserve( new_capacity ); - } - - for( unsigned int idim = 0; idim < 3; idim++ ) { - nvidia_momentum_[idim].reserve( new_capacity ); - } - - nvidia_weight_.reserve( new_capacity ); - nvidia_charge_.reserve( new_capacity ); - - if( has_quantum_parameter ) { - nvidia_chi_.reserve( new_capacity ); + for( auto prop: nvidia_double_prop_) { + prop->reserve( new_capacity ); } - if( has_Monte_Carlo_process ) { - nvidia_tau_.reserve( new_capacity ); + for( auto prop: nvidia_short_prop_ ) { + prop->reserve( new_capacity ); } if( tracked ) { @@ -572,138 +551,72 @@ void nvidiaParticles::softReserve( unsigned int particle_count, float growth_fac } nvidia_cell_keys_.reserve( new_capacity ); -} - -void nvidiaParticles::reserve( unsigned int particle_count ) -{ - for( unsigned int idim = 0; idim < nvidia_position_.size(); idim++ ) { - nvidia_position_[idim].reserve( particle_count ); - } - - for( unsigned int idim = 0; idim < 3; idim++ ) { - nvidia_momentum_[idim].reserve( particle_count ); - } - - nvidia_weight_.reserve( particle_count ); - nvidia_charge_.reserve( particle_count ); - if( has_quantum_parameter ) { - nvidia_chi_.reserve( particle_count ); + for( auto &v: double_buffers_ ) { + v.reserve( new_capacity ); } - - if( has_Monte_Carlo_process ) { - nvidia_tau_.reserve( particle_count ); + for( auto &v: short_buffers_ ) { + v.reserve( new_capacity ); } - - if( tracked ) { - nvidia_id_.reserve( particle_count ); + for( auto &v: uint64_buffers_ ) { + v.reserve( new_capacity ); } - nvidia_cell_keys_.reserve( particle_count ); } -void nvidiaParticles::resize( unsigned int particle_count ) +void nvidiaParticles::deviceFree() { - - // TODO(Etienne M): Use non-initializing vector/allocator (dont pay the cost - // of what you dont use) ? 
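// deviceReserve() above grows the device allocation only when the requested
// count exceeds the current capacity, and then over-allocates by
// growth_factor (1.3 by default) so that a sequence of slightly larger
// requests does not trigger a reallocation every time. The same policy on a
// host vector, as a sketch (hypothetical helper, not the Smilei API):

#include <cstddef>
#include <vector>

template <typename T>
void soft_reserve( std::vector<T> &v, std::size_t count, float growth_factor = 1.3f )
{
    if( count <= v.capacity() ) {
        return; // enough capacity already: do not touch the allocation
    }
    v.reserve( static_cast<std::size_t>( count * growth_factor ) );
}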
-
-    for( int idim = 0; idim < nvidia_position_.size(); idim++ ) {
-        nvidia_position_[idim].resize( particle_count );
-    }
-
-    for( int idim = 0; idim < 3; idim++ ) {
-        nvidia_momentum_[idim].resize( particle_count );
+    for( auto prop: nvidia_double_prop_) {
+        thrust::device_vector<double>().swap( *prop );
     }
 
-    nvidia_weight_.resize( particle_count );
-    nvidia_charge_.resize( particle_count );
-
-    if( has_quantum_parameter ) {
-        nvidia_chi_.resize( particle_count );
-    }
-
-    if( has_Monte_Carlo_process ) {
-        nvidia_tau_.resize( particle_count );
+    for( auto prop: nvidia_short_prop_ ) {
+        thrust::device_vector<short>().swap( *prop );
     }
 
     if( tracked ) {
-        nvidia_id_.resize( particle_count );
-    }
-
-    nvidia_cell_keys_.resize( particle_count );
-
-    gpu_nparts_ = particle_count;
-}
-
-void nvidiaParticles::free()
-{
-    for( auto& a_vector : nvidia_position_ ) {
-        thrust::device_vector<double> a_dummy_vector{};
-        std::swap( a_vector, a_dummy_vector );
+        thrust::device_vector<uint64_t>().swap( nvidia_id_ );
     }
 
-    for( auto& a_vector : nvidia_momentum_ ) {
-        thrust::device_vector<double> a_dummy_vector{};
-        std::swap( a_vector, a_dummy_vector );
-    }
+    thrust::device_vector<int>().swap( nvidia_cell_keys_ );
 
-    {
-        thrust::device_vector<double> a_dummy_vector{};
-        std::swap( nvidia_weight_, a_dummy_vector );
+    for( auto &v: double_buffers_ ) {
+        thrust::device_vector<double>().swap( v );
     }
-
-    {
-        thrust::device_vector<short> a_dummy_vector{};
-        std::swap( nvidia_charge_, a_dummy_vector );
+    for( auto &v: short_buffers_ ) {
+        thrust::device_vector<short>().swap( v );
     }
-
-    if( has_quantum_parameter ) {
-        thrust::device_vector<double> a_dummy_vector{};
-        std::swap( nvidia_chi_, a_dummy_vector );
-    }
-
-    if( has_Monte_Carlo_process ) {
-        thrust::device_vector<double> a_dummy_vector{};
-        std::swap( nvidia_tau_, a_dummy_vector );
-    }
-
-    if( tracked ) {
-        thrust::device_vector<uint64_t> a_dummy_vector{};
-        std::swap( nvidia_id_, a_dummy_vector );
-    }
-
-    {
-        thrust::device_vector<int> a_dummy_vector{};
-        std::swap( nvidia_cell_keys_, a_dummy_vector );
+    for( auto &v: uint64_buffers_ ) {
+        thrust::device_vector<uint64_t>().swap( v );
     }
 
     gpu_nparts_ = 0;
 }
 
-// ---------------------------------------------------------------------------------------------------------------------
-//! Resize particle vectors
-// ---------------------------------------------------------------------------------------------------------------------
 void nvidiaParticles::deviceResize( unsigned int new_size )
 {
-    for( unsigned int iprop=0 ; iprop<nvidia_double_prop_.size() ; iprop++ ) {
-        nvidia_double_prop_[iprop]->resize( new_size );
+    for( auto prop: nvidia_double_prop_ ) {
+        prop->resize( new_size );
     }
 
-    for( unsigned int iprop=0 ; iprop<nvidia_short_prop_.size() ; iprop++ ) {
-        nvidia_short_prop_[iprop]->resize( new_size );
+    for( auto prop: nvidia_short_prop_ ) {
+        prop->resize( new_size );
     }
-    //
-    // for( unsigned int iprop=0 ; iprop<nvidia_uint64_prop_.size() ; iprop++ ) {
-    //     nvidia_uint64_prop_[iprop]->resize( new_size );
-    // }
 
     if( tracked ) {
         nvidia_id_.resize( new_size );
     }
 
     nvidia_cell_keys_.resize( new_size );
 
+    for( auto &v: double_buffers_ ) {
+        v.resize( new_size );
+    }
+    for( auto &v: short_buffers_ ) {
+        v.resize( new_size );
+    }
+    for( auto &v: uint64_buffers_ ) {
+        v.resize( new_size );
+    }
+
     gpu_nparts_ = new_size;
 }
 
 void nvidiaParticles::deviceClear()
 {
-    for( unsigned int iprop = 0; iprop < nvidia_double_prop_.size(); iprop++ ) {
-        nvidia_double_prop_[iprop]->clear();
+    for( auto prop: nvidia_double_prop_ ) {
+        prop->clear();
     }
 
-    for( unsigned int iprop = 0; iprop < nvidia_short_prop_.size(); iprop++ ) {
-        nvidia_short_prop_[iprop]->clear();
+    for( auto prop: nvidia_short_prop_ ) {
+        prop->clear();
     }
 
     // TODO(Etienne M): Clear cell keys too ?
-    if (tracked) {
+    if( tracked ) {
         nvidia_id_.clear();
     }
 
+    for( auto &v: double_buffers_ ) {
+        v.clear();
+    }
+    for( auto &v: short_buffers_ ) {
+        v.clear();
+    }
+    for( auto &v: uint64_buffers_ ) {
+        v.clear();
+    }
+
     gpu_nparts_ = 0;
 }
 
@@ -748,23 +671,18 @@ void nvidiaParticles::initializeDataOnDevice()
     // The world shall end if we call this function multiple times
     SMILEI_ASSERT( nvidia_double_prop_.empty() );
 
-    const auto kPositionDimension = Position.size();
-    // We sure that we have as many say, position dimension as the base class. 
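// The code below relies on the same structure-of-arrays layout as the host
// Particles class: each named property vector is also registered in
// nvidia_double_prop_ or nvidia_short_prop_, so operations over "all
// properties" (resize, clear, swap, gather) are single loops instead of one
// statement per member. A reduced sketch of that pattern (hypothetical
// names, not the Smilei classes):

#include <cstddef>
#include <vector>

struct ParticleArrays {
    std::vector<double> x, px, weight;              // named storage
    std::vector<std::vector<double>*> double_props; // registry pointing at the members above

    ParticleArrays() : double_props{ &x, &px, &weight } {}

    void resizeAll( std::size_t n )
    {
        for( auto *prop : double_props ) {
            prop->resize( n ); // one loop covers every registered property
        }
    }
};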
- resizeDimensions( kPositionDimension ); + nvidia_position_.resize( Position.size() ); + nvidia_momentum_.resize( 3 ); // Initialize the list of pointers - - for( unsigned int i = 0; i < kPositionDimension; i++ ) { - nvidia_double_prop_.push_back( &nvidia_position_[i] ); + for( auto &pos: nvidia_position_ ) { + nvidia_double_prop_.push_back( &pos ); } - - for( unsigned int i = 0; i < 3; i++ ) { - nvidia_double_prop_.push_back( &nvidia_momentum_[i] ); + for( auto &mom: nvidia_momentum_ ) { + nvidia_double_prop_.push_back( &mom ); } - nvidia_double_prop_.push_back( &nvidia_weight_ ); - nvidia_short_prop_.push_back( &nvidia_charge_ ); // Quantum parameter (for QED effects): @@ -781,9 +699,9 @@ void nvidiaParticles::initializeDataOnDevice() nvidia_double_prop_.push_back( &nvidia_tau_ ); } - const auto kHostParticleCount = Position[0].size(); + const auto hostParticleCount = Position[0].size(); - if( kHostParticleCount == 0 ) { + if( hostParticleCount == 0 ) { // Should we reserve some space ? // reserve( 100 ); } else { @@ -805,6 +723,12 @@ void nvidiaParticles::initializeDataOnDevice() // setHostBinIndex(); } else { + // Allocate buffers that are necessary for sorting particles with binning + double_buffers_.resize( nvidia_double_prop_.size() ); + short_buffers_ .resize( nvidia_short_prop_ .size() ); + uint64_buffers_.resize( 2 ); + deviceResize( gpu_nparts_ ); // resizes the buffers + // At this point, a copy of the host particles and last_index is on the // device and we know we support the space dimension. @@ -832,7 +756,7 @@ void nvidiaParticles::initializeIDsOnDevice() // ------------------------------------------------------------------------------------------------- void nvidiaParticles::copyFromHostToDevice() { - resize( Position[0].size() ); + deviceResize( Position[0].size() ); for( int idim = 0; idim < Position.size(); idim++ ) { thrust::copy( Position[idim].begin(), Position[idim].end(), nvidia_position_[idim].begin() ); @@ -841,7 +765,6 @@ void nvidiaParticles::copyFromHostToDevice() for( int idim = 0; idim < Momentum.size(); idim++ ) { thrust::copy( Momentum[idim].begin(), Momentum[idim].end(), nvidia_momentum_[idim].begin() ); } - thrust::copy( Weight.begin(), Weight.end(), nvidia_weight_.begin() ); thrust::copy( Charge.begin(), Charge.end(), nvidia_charge_.begin() ); @@ -921,7 +844,7 @@ void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pre // Resize destination buffer (copy_if does not resize) nvidiaParticles* const dest = static_cast( buffer ); - dest->resize( nparts_to_copy ); + dest->deviceResize( nparts_to_copy ); if( nparts_to_copy ) { // Copy the particles to the destination @@ -951,7 +874,7 @@ int nvidiaParticles::addParticles( Particles* particles_to_inject ) { const auto nparts = gpu_nparts_; nvidiaParticles* to_inject = static_cast( particles_to_inject ); - resize( nparts + to_inject->gpu_nparts_ ); + deviceResize( nparts + to_inject->gpu_nparts_ ); pasteParticles( to_inject, nparts, 0 ); return to_inject->gpu_nparts_; } @@ -1009,7 +932,7 @@ void nvidiaParticles::pasteParticles( nvidiaParticles* particles_to_inject, size int nvidiaParticles::eraseLeavingParticles() { const auto nremoved = eraseParticlesByPredicate( cellKeyBelow<0>(), 0 ); - resize( gpu_nparts_ - nremoved ); + deviceResize( gpu_nparts_ - nremoved ); return nremoved; } @@ -1023,12 +946,12 @@ int nvidiaParticles::eraseParticlesByPredicate( Predicate pred, size_t offset ) // Copy the particles to the destination // Using more memory, we could use the faster remove_copy_if // 
NOTE: remove_if is stable. - for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { - const auto in = nvidia_double_prop_[ip]->begin(); + for( auto prop: nvidia_double_prop_ ) { + const auto in = prop->begin(); thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } - for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { - const auto in = nvidia_short_prop_[ip]->begin(); + for( auto prop: nvidia_short_prop_ ) { + const auto in = prop->begin(); thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } if( tracked ) { @@ -1049,29 +972,22 @@ void nvidiaParticles::createParticles( int n_additional_particles ) { int n_particles = gpu_nparts_; int new_size = n_particles + n_additional_particles; - for( unsigned int iprop=0 ; ipropbegin() + n_particles, prop->begin() + new_size, 0); } - - for( unsigned int iprop=0 ; ipropbegin() + n_particles, prop->begin() + new_size, 0); } - - // for( unsigned int iprop=0 ; iprop index( gpu_nparts_ ); - thrust::sequence( thrust::device, index.begin(), index.end() ); - thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); - - // Sort particles using thrust::gather, according to the sorting map - thrust::device_vector buffer( gpu_nparts_ ); - for( auto prop: nvidia_double_prop_ ) { - thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer.begin() ); - prop->swap( buffer ); - } - buffer.clear(); - thrust::device_vector buffer_short( gpu_nparts_ ); - for( auto prop: nvidia_short_prop_ ) { - thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer_short.begin() ); - prop->swap( buffer_short ); - } - buffer_short.clear(); - if( tracked ) { - thrust::device_vector buffer_uint64( gpu_nparts_ ); - thrust::gather( thrust::device, index.begin(), index.end(), nvidia_id_.begin(), buffer_uint64.begin() ); - nvidia_id_.swap( buffer_uint64 ); - buffer_uint64.clear(); - } -} +// //! Sort by cell_keys_ +// //! This version synchronizes for every vector, but uses less buffers +// void nvidiaParticles::sortParticleByKey() +// { +// // Make a sorting map using the cell keys (like numpy.argsort) +// thrust::device_vector index( gpu_nparts_ ); +// thrust::sequence( thrust::device, index.begin(), index.end() ); +// thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); +// +// // Sort particles using thrust::gather, according to the sorting map +// thrust::device_vector buffer( gpu_nparts_ ); +// for( auto prop: nvidia_double_prop_ ) { +// thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer.begin() ); +// prop->swap( buffer ); +// } +// buffer.clear(); +// thrust::device_vector buffer_short( gpu_nparts_ ); +// for( auto prop: nvidia_short_prop_ ) { +// thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer_short.begin() ); +// prop->swap( buffer_short ); +// } +// buffer_short.clear(); +// if( tracked ) { +// thrust::device_vector buffer_uint64( gpu_nparts_ ); +// thrust::gather( thrust::device, index.begin(), index.end(), nvidia_id_.begin(), buffer_uint64.begin() ); +// nvidia_id_.swap( buffer_uint64 ); +// buffer_uint64.clear(); +// } +// } //! Sort by cell_keys_ //! 
This version is asynchronous, but requires a buffer of equal size to be provided -void nvidiaParticles::sortParticleByKey( nvidiaParticles& buffer ) +void nvidiaParticles::sortParticleByKey() { // Make a sorting map using the cell keys (like numpy.argsort) - thrust::device_vector index( gpu_nparts_ ); + thrust::device_vector & index = uint64_buffers_[1]; thrust::sequence( thrust::device, index.begin(), index.end() ); thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); // Sort particles using thrust::gather, according to the sorting map for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { - thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_double_prop_[ip]->begin(), buffer.nvidia_double_prop_[ip]->begin() ); + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_double_prop_[ip]->begin(), double_buffers_[ip].begin() ); } for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { - thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_short_prop_[ip]->begin(), buffer.nvidia_short_prop_[ip]->begin() ); + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_short_prop_[ip]->begin(), short_buffers_[ip].begin() ); } if( tracked ) { - thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_id_.begin(), buffer.nvidia_id_.begin() ); + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_id_.begin(), uint64_buffers_[0].begin() ); } SMILEI_ACCELERATOR_DEVICE_SYNC(); - swap( buffer ); + // Swap properties with their buffer + for( int iprop = 0; iprop < nvidia_double_prop_.size(); iprop++ ) { + nvidia_double_prop_[iprop]->swap( double_buffers_[iprop] ); + } + for( int iprop = 0; iprop < nvidia_short_prop_.size(); iprop++ ) { + nvidia_short_prop_[iprop]->swap( short_buffers_[iprop] ); + } + if( tracked ) { + nvidia_id_.swap( uint64_buffers_[0] ); + } } @@ -1227,7 +1152,7 @@ void nvidiaParticles::naiveImportAndSortParticles( nvidiaParticles* particles_to // Inject newly arrived particles in particles_to_inject const size_t current_size = gpu_nparts_; - resize( current_size + particles_to_inject->size() ); + deviceResize( current_size + particles_to_inject->size() ); pasteParticles( particles_to_inject, current_size, 0 ); particles_to_inject->clear(); } diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index 37b3fc18d..19c20b70d 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -34,33 +34,20 @@ class nvidiaParticles : public Particles //! Destructor for nvidiaParticles ~nvidiaParticles(); - //! Allocate the right amount of position and momentum dimensions - void resizeDimensions( unsigned int nDim ); - //! Reserve space for (particle_count * growth_factor) particles only if //! particle_count >= deviceCapacity(). Must be called after //! allocateDimensions() - void softReserve( unsigned int particle_count, float growth_factor = 1.3F ); - - //! Reserve space for particle_count particles. Must be called after - //! allocateDimensions() - void reserve( unsigned int particle_count ); - - //! Allocate particle_count particles. Must be called after - //! allocateDimensions() - //! Set the size (deviceSize) of nvidiaParticles to particle_count. - //! - void resize( unsigned int particle_count ); + void deviceReserve( unsigned int particle_count, float growth_factor = 1.3F ); //! 
Assures that the memory held by the nvidia_[position|momentum|weight|
    //! charge|chi|tau|cell_keys]_ is freed. This is not something you can
    //! achieve via a naive resize.
    //! The pointers in nvidia_[double|short]_prop_ are not invalidated.
    //!
-    void free();
+    void deviceFree();
 
     //! Resize Particle vectors on device
-    void deviceResize(unsigned int new_size);
+    void deviceResize( unsigned int new_size );
 
     //! Remove all particles
     void deviceClear();
@@ -112,18 +99,6 @@ class nvidiaParticles : public Particles
         return thrust::raw_pointer_cast( nvidia_id_.data() );
     };
 
-    void swap( nvidiaParticles & p ) {
-        for( int iprop = 0; iprop < nvidia_double_prop_.size(); iprop++ ) {
-            nvidia_double_prop_[iprop]->swap( *p.nvidia_double_prop_[iprop] );
-        }
-        for( int iprop = 0; iprop < nvidia_short_prop_.size(); iprop++ ) {
-            nvidia_short_prop_[iprop]->swap( *p.nvidia_short_prop_[iprop] );
-        }
-        if( tracked ) {
-            nvidia_id_.swap( p.nvidia_id_ );
-        }
-    }
-
     // -----------------------------------------------------------------------------
     //! Move leaving particles to the buffers
     // -----------------------------------------------------------------------------
@@ -253,6 +228,11 @@ class nvidiaParticles : public Particles
     //! List of short* arrays
     std::vector<thrust::device_vector<short>*> nvidia_short_prop_;
 
+    //! Buffers for sorting particles
+    std::vector<thrust::device_vector<double>> double_buffers_;
+    std::vector<thrust::device_vector<short>> short_buffers_;
+    std::vector<thrust::device_vector<uint64_t>> uint64_buffers_;
+
     const Params* parameters_;
     //! We are interested in having the patch coordinates. This allows us to
     //! compute a bin index relative to the patch which in turns, makes the bin

From d204c5b205cdb4d75206102e19d56e39e2d10a2a Mon Sep 17 00:00:00 2001
From: Francesco Massimo 
Date: Wed, 5 Jun 2024 16:10:32 +0200
Subject: [PATCH 51/54] correct B-TIS3 implementation in 1D

---
 src/ElectroMagn/ElectroMagn1D.cpp         | 4 ++--
 src/Interpolator/Interpolator1D2Order.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/ElectroMagn/ElectroMagn1D.cpp b/src/ElectroMagn/ElectroMagn1D.cpp
index d90c6ee2e..a444e19f5 100755
--- a/src/ElectroMagn/ElectroMagn1D.cpp
+++ b/src/ElectroMagn/ElectroMagn1D.cpp
@@ -615,9 +615,9 @@ void ElectroMagn1D::centerMagneticFields()
 #endif
         for( unsigned int i=0 ; i Date: Fri, 7 Jun 2024 11:32:24 +0200
Subject: [PATCH 52/54] revert persistent buffers until more complete analysis

---
 src/Particles/nvidiaParticles.cu | 136 +++++++++++--------------------
 src/Particles/nvidiaParticles.h  |   5 --
 2 files changed, 46 insertions(+), 95 deletions(-)

diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu
index f84ed3463..6bc9387b4 100644
--- a/src/Particles/nvidiaParticles.cu
+++ b/src/Particles/nvidiaParticles.cu
@@ -321,13 +321,15 @@ namespace detail {
             particle_container.eraseParticlesByPredicate( cellKeyBelow<0>(), last_filled );
             particle_container.deviceResize( new_count );
         }
-        particle_to_inject.deviceFree();
 
         // Compute keys of particles
        computeParticleClusterKey( particle_container, parameters, a_parent_patch );
 
-        // Sort particles by keys
-        particle_container.sortParticleByKey();
+        // Sort particles by keys
+        // using particle_to_inject as a buffer (it is swapped with particle_container after sorting)
+        particle_to_inject.deviceReserve( new_count ); // reserve a bit more memory for the final arrays
+        particle_to_inject.deviceResize( new_count );
+        particle_container.sortParticleByKey( particle_to_inject );
 
         // Recompute bin locations
         computeBinIndex( particle_container );
@@ -538,7 +540,7 @@ void nvidiaParticles::deviceReserve( unsigned int 
particle_count, float growth_f const unsigned int new_capacity = static_cast( particle_count * growth_factor ); - for( auto prop: nvidia_double_prop_) { + for( auto prop: nvidia_double_prop_ ) { prop->reserve( new_capacity ); } @@ -551,22 +553,11 @@ void nvidiaParticles::deviceReserve( unsigned int particle_count, float growth_f } nvidia_cell_keys_.reserve( new_capacity ); - - for( auto &v: double_buffers_ ) { - v.reserve( new_capacity ); - } - for( auto &v: short_buffers_ ) { - v.reserve( new_capacity ); - } - for( auto &v: uint64_buffers_ ) { - v.reserve( new_capacity ); - } - } void nvidiaParticles::deviceFree() { - for( auto prop: nvidia_double_prop_) { + for( auto prop: nvidia_double_prop_ ) { thrust::device_vector().swap( *prop ); } @@ -580,16 +571,6 @@ void nvidiaParticles::deviceFree() thrust::device_vector().swap( nvidia_cell_keys_ ); - for( auto &v: double_buffers_ ) { - thrust::device_vector().swap( v ); - } - for( auto &v: short_buffers_ ) { - thrust::device_vector().swap( v ); - } - for( auto &v: uint64_buffers_ ) { - thrust::device_vector().swap( v ); - } - gpu_nparts_ = 0; } @@ -606,18 +587,9 @@ void nvidiaParticles::deviceResize( unsigned int new_size ) if( tracked ) { nvidia_id_.resize( new_size ); } - + nvidia_cell_keys_.resize( new_size ); - for( auto &v: double_buffers_ ) { - v.resize( new_size ); - } - for( auto &v: short_buffers_ ) { - v.resize( new_size ); - } - for( auto &v: uint64_buffers_ ) { - v.resize( new_size ); - } - + gpu_nparts_ = new_size; } @@ -640,16 +612,6 @@ void nvidiaParticles::deviceClear() if( tracked ) { nvidia_id_.clear(); } - - for( auto &v: double_buffers_ ) { - v.clear(); - } - for( auto &v: short_buffers_ ) { - v.clear(); - } - for( auto &v: uint64_buffers_ ) { - v.clear(); - } gpu_nparts_ = 0; } @@ -722,12 +684,6 @@ void nvidiaParticles::initializeDataOnDevice() // setHostBinIndex(); } else { - - // Allocate buffers that are necessary for sorting particles with binning - double_buffers_.resize( nvidia_double_prop_.size() ); - short_buffers_ .resize( nvidia_short_prop_ .size() ); - uint64_buffers_.resize( 2 ); - deviceResize( gpu_nparts_ ); // resizes the buffers // At this point, a copy of the host particles and last_index is on the // device and we know we support the space dimension. @@ -1006,66 +962,66 @@ void nvidiaParticles::importAndSortParticles( Particles* particles_to_inject ) setHostBinIndex(); } -// //! Sort by cell_keys_ -// //! 
This version synchronizes for every vector, but uses less buffers -// void nvidiaParticles::sortParticleByKey() -// { -// // Make a sorting map using the cell keys (like numpy.argsort) -// thrust::device_vector index( gpu_nparts_ ); -// thrust::sequence( thrust::device, index.begin(), index.end() ); -// thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); -// -// // Sort particles using thrust::gather, according to the sorting map -// thrust::device_vector buffer( gpu_nparts_ ); -// for( auto prop: nvidia_double_prop_ ) { -// thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer.begin() ); -// prop->swap( buffer ); -// } -// buffer.clear(); -// thrust::device_vector buffer_short( gpu_nparts_ ); -// for( auto prop: nvidia_short_prop_ ) { -// thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer_short.begin() ); -// prop->swap( buffer_short ); -// } -// buffer_short.clear(); -// if( tracked ) { -// thrust::device_vector buffer_uint64( gpu_nparts_ ); -// thrust::gather( thrust::device, index.begin(), index.end(), nvidia_id_.begin(), buffer_uint64.begin() ); -// nvidia_id_.swap( buffer_uint64 ); -// buffer_uint64.clear(); -// } -// } +//! Sort by cell_keys_ +//! This version synchronizes for every vector, but uses less buffers +void nvidiaParticles::sortParticleByKey() +{ + // Make a sorting map using the cell keys (like numpy.argsort) + thrust::device_vector index( gpu_nparts_ ); + thrust::sequence( thrust::device, index.begin(), index.end() ); + thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); + + // Sort particles using thrust::gather, according to the sorting map + thrust::device_vector buffer( gpu_nparts_ ); + for( auto prop: nvidia_double_prop_ ) { + thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer.begin() ); + prop->swap( buffer ); + } + buffer.clear(); + thrust::device_vector buffer_short( gpu_nparts_ ); + for( auto prop: nvidia_short_prop_ ) { + thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer_short.begin() ); + prop->swap( buffer_short ); + } + buffer_short.clear(); + if( tracked ) { + thrust::device_vector buffer_uint64( gpu_nparts_ ); + thrust::gather( thrust::device, index.begin(), index.end(), nvidia_id_.begin(), buffer_uint64.begin() ); + nvidia_id_.swap( buffer_uint64 ); + buffer_uint64.clear(); + } +} //! Sort by cell_keys_ //! 
This version is asynchronous, but requires a buffer of equal size to be provided -void nvidiaParticles::sortParticleByKey() +void nvidiaParticles::sortParticleByKey( nvidiaParticles &buffer ) { // Make a sorting map using the cell keys (like numpy.argsort) - thrust::device_vector & index = uint64_buffers_[1]; + thrust::device_vector index( gpu_nparts_ ); thrust::sequence( thrust::device, index.begin(), index.end() ); thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); // Sort particles using thrust::gather, according to the sorting map for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { - thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_double_prop_[ip]->begin(), double_buffers_[ip].begin() ); + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_double_prop_[ip]->begin(), buffer.nvidia_double_prop_[ip]->begin() ); } for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { - thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_short_prop_[ip]->begin(), short_buffers_[ip].begin() ); + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_short_prop_[ip]->begin(), buffer.nvidia_short_prop_[ip]->begin() ); } if( tracked ) { - thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_id_.begin(), uint64_buffers_[0].begin() ); + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_id_.begin(), buffer.nvidia_id_.begin() ); } SMILEI_ACCELERATOR_DEVICE_SYNC(); // Swap properties with their buffer for( int iprop = 0; iprop < nvidia_double_prop_.size(); iprop++ ) { - nvidia_double_prop_[iprop]->swap( double_buffers_[iprop] ); + nvidia_double_prop_[iprop]->swap( *buffer.nvidia_double_prop_[iprop] ); } for( int iprop = 0; iprop < nvidia_short_prop_.size(); iprop++ ) { - nvidia_short_prop_[iprop]->swap( short_buffers_[iprop] ); + nvidia_short_prop_[iprop]->swap( *buffer.nvidia_short_prop_[iprop] ); } if( tracked ) { - nvidia_id_.swap( uint64_buffers_[0] ); + nvidia_id_.swap( buffer.nvidia_id_ ); } } diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index 19c20b70d..a02edffc8 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -228,11 +228,6 @@ class nvidiaParticles : public Particles //! List of short* arrays std::vector*> nvidia_short_prop_; - //! Buffers for sorting particles - std::vector> double_buffers_; - std::vector> short_buffers_; - std::vector> uint64_buffers_; - const Params* parameters_; //! We are interested in having the patch coordinates. This allows us to //! compute a bin index relative to the patch which in turns, makes the bin From b033879e56499264b6104f0d3bd3f4d1f5b5021f Mon Sep 17 00:00:00 2001 From: Francesco Massimo Date: Mon, 24 Jun 2024 11:19:53 +0000 Subject: [PATCH 53/54] Adding new publications --- doc/Sphinx/Overview/material.rst | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/Sphinx/Overview/material.rst b/doc/Sphinx/Overview/material.rst index 66ed26180..5712bfeae 100644 --- a/doc/Sphinx/Overview/material.rst +++ b/doc/Sphinx/Overview/material.rst @@ -30,7 +30,7 @@ Papers involving Smilei ^^^^^^^^^^^^^^^^^^^^^^^^ Only papers published in peer-reviewed journals are listed (for the complete list of citing papers see `Google Scholar `_). 
-As of May 2024, 189 papers have been published covering a broad range of topics: +As of May 2024, at least 192 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,6 +50,25 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. + +.. [Sikorski2024] + + P. Sikorski, A. G. R. Thomas, S. S. Bulanov, M. Zepf and D. Seipt, + `Novel signatures of radiation reaction in electron–laser sidescattering`, + `New Journal of Physics 26 063011 (2024) `_ + +.. [Ivanov2024b] + + K. A. Ivanov, S. A. Shulyapov, D. A. Gorlova, I. P. Tsygvintsev, M. S. Krivokorytov, I. N. Tsymbalov, R. V. Volkov and A. B. Savelev, + `Laser-accelerated MeV-scale collimated electron bunch from a near-critical plasma of a liquid jet target`, + `Laser Physics Letters 21, 7 (2024) `_ + +.. [Malik2024] + + H. K. Malik, S. Kumar, and D. K. Singh, + `Effect of trapezoidal plasma density region in bubble wakefield acceleration`, + `Physica Scripta 99, 075601 (2024) `_ + .. [Krafft2024b] C. Krafft, P. Savoini, and F. J. Polanco-Rodríguez, @@ -62,7 +81,7 @@ Following is the distribution of these topics in the listed publications up to N `All-optical source size and emittance measurements of laser-accelerated electron beams`, `Physical Review Accelerators and Beams 27, 052803 (2024) `_ -.. [Ivanov2024] +.. [Ivanov2024a] K. A. Ivanov, D. A. Gorlova, I. N. Tsymbalov, I. P. Tsygvintsev, S. A. Shulyapov, R. V. Volkov, and A. B. Savel’ev, `Laser-driven pointed acceleration of electrons with preformed plasma lens`, From 438d43d8ab727d02dd8f4402c50393343e871b99 Mon Sep 17 00:00:00 2001 From: Frederic Perez Date: Mon, 24 Jun 2024 13:32:13 +0200 Subject: [PATCH 54/54] prepare v5.1 --- doc/Sphinx/Overview/releases.rst | 58 ++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/doc/Sphinx/Overview/releases.rst b/doc/Sphinx/Overview/releases.rst index 5c3e9d046..e271b32c5 100755 --- a/doc/Sphinx/Overview/releases.rst +++ b/doc/Sphinx/Overview/releases.rst @@ -16,18 +16,43 @@ Get Smilei You can find older, `unsupported versions here `_ +.. +.. ---- + +.. .. _latestVersion: + +.. Changes made in the repository (not released) +.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ---- -.. _latestVersion: +Ongoing projects +^^^^^^^^^^^^^^^^ + +* Already available, but experimental: + + * Particle merging + * Nuclear reactions + * Perfectly Matched Layers + * NewParticles diagnostic + +* In preparation: + + * Spectral solvers + + +---- -Changes made in the repository (not released) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Release 5.1 +^^^^^^^^^^^^^^^^^^^^^ -* GPU: +* **GPU**: + * ``1Dcartesian`` geometry now available. * Compilation simplified and better documented. + * Improved performance of particle sorting. -* Features: +* **Features**: * Relativistic field initialization now supports multiple species and both direction propagations. * Added the argument ``phase_offset`` in laser definitions such as ``LaserGaussian2D``. @@ -37,7 +62,7 @@ Changes made in the repository (not released) * Deprecated ``smilei_rand_max``. * New namelist variables ``smilei_omp_threads`` and ``smilei_total_cores``. 
-* Happi: +* **Happi**: * In ``Scalar``, it is now possible to make an operation on scalars such as ``"Uelm+Ukin"``. * The list of available scalars can be obtained from ``getScalars()``. @@ -46,11 +71,11 @@ Changes made in the repository (not released) * Changed coordinate reference for 2D probe in 3D or AM geometry (zero is the box origin projected orthogonally on the probe plane). -* Documentation: +* **Documentation**: * Dark theme (click the switch on the bottom left, or set browser preferences). -* Bug fixes: +* **Bug fixes** : * ``dump_minutes`` often failed to write some checkpoint files. * ``"auto"`` limits in ``ParticleBinning`` could fail with only one side on ``"auto"``. @@ -58,23 +83,6 @@ Changes made in the repository (not released) ---- -Projects -^^^^^^^^^^^^^^^^ - -* Already available, but experimental: - - * Particle merging - * Nuclear reactions - * Perfectly Matched Layers - * NewParticles diagnostic - -* In preparation: - - * Spectral solvers - - ----- - Release 5.0 ^^^^^^^^^^^^^^^^^^^^^