XGCa
poloidal_sum.hpp
#ifndef POLOIDAL_SUM_HPP
#define POLOIDAL_SUM_HPP

#include "my_mirror_view.hpp"
#include "timer_macro.hpp"

// Element-wise sum of a View over all planes, done in place. Does a copy to the
// MPI memory space if different from the View's memory space
template<class T>
void poloidal_sum_in_place(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  // Cast to 1D with an unmanaged view for a generic element-wise operation
  View<double*, CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());

  using MPIHostType = HostType; // Do this MPI_Allreduce on Host; GPU-aware MPI consistently hangs or aborts here

  // Copy to host if on GPU, since GPU-aware MPI is not used here
  auto f_mpi_d = my_mirror_scratch_view(f_1D, MPIHostType());
  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
  mirror_copy(f_mpi, f_1D);

  /*** Sum all planes ***/
  MPI_Allreduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, pol_decomp.mpi.plane_comm);

  // Copy back
  mirror_copy(f_1D, f_mpi);

  Kokkos::fence();
}
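
// Usage sketch (illustrative only; `grid`, `nphi`, and `field` are hypothetical
// names, not part of this header). Each plane rank accumulates its local
// contribution, then the element-wise total over planes is formed in place:
//
//   View<double**, CLayout, DeviceType> field(NoInit("field"), grid.nnode, nphi);
//   // ... fill field with this plane's contribution ...
//   poloidal_sum_in_place(pol_decomp, field);
//   // field now holds the sum over all planes, on every plane rank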

// Broadcast a view from the root rank of the plane to the rest of the ranks on the plane
template<class Device>
void poloidal_bcast(const DomainDecomposition<DeviceType>& pol_decomp, const View<double*, CLayout, Device>& full_view, int ROOT_RANK){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  // Copy to MPIDeviceType space if different from Device
  GPTLstart("POL_BCAST_MIRROR");
  auto f_mpi = my_mirror_view(full_view, MPIDeviceType());
  if(pol_decomp.mpi.my_plane_rank==ROOT_RANK) mirror_copy(f_mpi, full_view);
  GPTLstop("POL_BCAST_MIRROR");

  // Broadcast
  GPTLstart("POL_BCAST_MPI");
  MPI_Bcast(f_mpi.data(), f_mpi.size(), MPI_DOUBLE, ROOT_RANK, pol_decomp.mpi.plane_comm);
  GPTLstop("POL_BCAST_MPI");

  // Copy back
  GPTLstart("POL_BCAST_MIRRORBACK");
  mirror_copy(full_view, f_mpi);
  GPTLstop("POL_BCAST_MIRRORBACK");
}
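
// Usage sketch (illustrative only; `n` is a hypothetical size). The root rank
// of the plane fills the view; after the call every plane rank holds a copy:
//
//   View<double*, CLayout, DeviceType> buf(NoInit("buf"), n);
//   if(pol_decomp.mpi.my_plane_rank == 0){ /* fill buf on the root */ }
//   poloidal_bcast(pol_decomp, buf, 0);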

// Gather view of size grid.nnode
template<class Device>
void poloidal_gather(const DomainDecomposition<DeviceType>& pol_decomp, const View<double*, CLayout, Device>& full_view){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  GPTLstart("POL_GATHER_SETUP");
  int nv = 1; // values per node
  auto mpi_plan = pol_decomp.mpi_distribution_plan(nv);
  GPTLstop("POL_GATHER_SETUP");
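
  // Illustrative example (hypothetical numbers, assuming the plan partitions
  // the view by owned vertices): with 3 plane ranks owning 4, 3, and 3 of 10
  // vertices and nv==1, the plan would contain
  //   cnts   = {4, 3, 3}
  //   displs = {0, 4, 7}
  // so plane rank 1 sees my_count()==3 and my_displ()==4.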

  //using MPIHostType = HostType; // Do this MPI_Allgatherv on Host for now, should try with GPU-aware MPI

  GPTLstart("POL_GATHER_MIRROR");
  auto f_mpi = my_mirror_view(full_view, MPIDeviceType());
  mirror_copy(f_mpi, full_view);

  View<double*, CLayout, MPIDeviceType> f_mpi_in(NoInit("f_mpi_in"), mpi_plan.my_count());
  View<double*, CLayout, MPIDeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_mpi_unm(f_mpi.data() + mpi_plan.my_displ(), mpi_plan.my_count());
  Kokkos::deep_copy(f_mpi_in, f_mpi_unm);
  GPTLstop("POL_GATHER_MIRROR");

  GPTLstart("POL_GATHER_MPI");
  //MPI_Allgatherv(MPI_IN_PLACE, mpi_plan.my_count(), MPI_DOUBLE, f_mpi.data(), mpi_plan.cnts.data(), mpi_plan.displs.data(), MPI_DOUBLE, pol_decomp.mpi.plane_comm);

  // Alternative algorithm: Gatherv to rank 0, then Bcast. Seems faster than the Allgatherv above, though it shouldn't be
  MPI_Gatherv(f_mpi_in.data(), mpi_plan.my_count(), MPI_DOUBLE, f_mpi.data(), mpi_plan.cnts.data(), mpi_plan.displs.data(), MPI_DOUBLE, 0, pol_decomp.mpi.plane_comm);
  MPI_Bcast(f_mpi.data(), f_mpi.size(), MPI_DOUBLE, 0, pol_decomp.mpi.plane_comm);
  GPTLstop("POL_GATHER_MPI");

  GPTLstart("POL_GATHER_MIRRORBACK");
  // Copy back
  mirror_copy(full_view, f_mpi);
  GPTLstop("POL_GATHER_MIRRORBACK");
}
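
// Usage sketch for the 1D overload (illustrative only; `grid` is hypothetical).
// Each rank fills only its own slice of a grid.nnode-sized view, then gathers:
//
//   View<double*, CLayout, DeviceType> field(NoInit("field"), grid.nnode);
//   // ... fill this rank's slice [my_displ, my_displ + my_count) ...
//   poloidal_gather(pol_decomp, field);
//   // field is now complete on every rank of the plane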

template<class Device>
void poloidal_gather(const DomainDecomposition<DeviceType>& pol_decomp, const View<double***, CLayout, Device>& full_view){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  GPTLstart("POL_GATHER_SETUP");
  View<int*, CLayout, HostType> cnts(NoInit("cnts"), pol_decomp.mpi.n_plane_ranks);
  View<int*, CLayout, HostType> displs(NoInit("displs"), pol_decomp.mpi.n_plane_ranks);

  // Fill the cnts and displs arrays with the number of values owned by each processor
  int nv = full_view.extent(0)*full_view.extent(2); // values per vertex node: nvr*nvz
  for(int i = 0; i<cnts.size(); i++){
    displs(i) = nv*(pol_decomp.gvid0_pid_h(i) - 1); // gvid0_pid_h is 1-indexed
    cnts(i) = nv*(pol_decomp.gvid0_pid_h(i+1) - pol_decomp.gvid0_pid_h(i));
  }
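
  // Worked example (hypothetical numbers): with nv==6 and
  // gvid0_pid_h = {1, 5, 9, 11} across 3 plane ranks, the loop yields
  //   displs = {0, 24, 48}
  //   cnts   = {24, 24, 12}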

  // Note: These are currently not the same as pol_decomp.node_offset and pol_decomp.nnodes. That's confusing! ALS
  int my_node_send_count = pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank+1) - pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank);
  int my_send_count = my_node_send_count*nv;
  int my_node_offset = pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank) - 1;

  GPTLstop("POL_GATHER_SETUP");

  // Need to transpose first; do this in a temporary array
  GPTLstart("POL_GATHER_ALLOC1");
  View<double***, CLayout, Device> tmp(NoInit("tmp"), my_node_send_count, full_view.extent(0), full_view.extent(2));
  GPTLstop("POL_GATHER_ALLOC1");

  GPTLstart("POL_GATHER_TRANSP1");
  Kokkos::parallel_for("transpose", Kokkos::RangePolicy<typename Device::execution_space>(0, tmp.extent(0)), KOKKOS_LAMBDA(const int inode){
    for(int imu = 0; imu < tmp.extent(1); imu++){
      for(int ivp = 0; ivp < tmp.extent(2); ivp++){
        tmp(inode, imu, ivp) = full_view(imu, inode+my_node_offset, ivp);
      }
    }
  });
  Kokkos::fence();
  GPTLstop("POL_GATHER_TRANSP1");
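
  // After the transpose, the node index varies slowest, so this rank's
  // contribution is one contiguous run of my_send_count doubles in tmp,
  // which lets the Allgatherv below exchange it as a single buffer.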

  //using MPIHostType = HostType; // Do this MPI_Allgatherv on Host; GPU-aware MPI consistently hangs or aborts here

  GPTLstart("POL_GATHER_MIRROR");
  // Cast to 1D with an unmanaged view for a generic element-wise operation, then copy to host if on GPU and GPU-aware MPI is not available
  View<double*, CLayout, Device, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(tmp.data()), tmp.size());
  auto f_mpi_d = my_mirror_scratch_view(f_1D, MPIDeviceType());
  auto f_mpi = my_mirror_view(f_1D, MPIDeviceType(), f_mpi_d);
  mirror_copy(f_mpi, f_1D);

  // Now do the same for the output view, copying into a temporary view for the transpose
  View<double***, CLayout, Device> full_tmp(NoInit("full_tmp"), full_view.extent(1), full_view.extent(0), full_view.extent(2));
  View<double*, CLayout, Device, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D_out((double*)(full_tmp.data()), full_tmp.size());
  auto f_mpi_d_out = my_mirror_scratch_view(f_1D_out, MPIDeviceType());
  auto f_mpi_out = my_mirror_view(f_1D_out, MPIDeviceType(), f_mpi_d_out);
  // No copy needed here since the output view will be overwritten
  GPTLstop("POL_GATHER_MIRROR");

  GPTLstart("POL_GATHER_MPI");
  MPI_Allgatherv(f_mpi.data(), my_send_count, MPI_DOUBLE, f_mpi_out.data(), cnts.data(), displs.data(), MPI_DOUBLE, pol_decomp.mpi.plane_comm);
  GPTLstop("POL_GATHER_MPI");

  GPTLstart("POL_GATHER_MIRRORBACK");
  // Copy back
  mirror_copy(f_1D_out, f_mpi_out);
  GPTLstop("POL_GATHER_MIRRORBACK");

  // Transpose back into full_view
  GPTLstart("POL_GATHER_TRANSP2");
  Kokkos::parallel_for("transpose_back", Kokkos::RangePolicy<typename Device::execution_space>(0, full_tmp.extent(0)), KOKKOS_LAMBDA(const int inode){
    for(int imu = 0; imu < full_tmp.extent(1); imu++){
      for(int ivp = 0; ivp < full_tmp.extent(2); ivp++){
        full_view(imu, inode, ivp) = full_tmp(inode, imu, ivp);
      }
    }
  });
  Kokkos::fence();
  GPTLstop("POL_GATHER_TRANSP2");
}
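
// Usage sketch for the 3D overload (illustrative only; `n_mu`, `n_vp`, and
// `grid` are hypothetical names). The view is laid out (imu, inode, ivp), and
// each rank fills only its own node range before the gather:
//
//   View<double***, CLayout, DeviceType> f(NoInit("f"), n_mu, grid.nnode, n_vp);
//   // ... fill f(:, owned node range, :) on this rank ...
//   poloidal_gather(pol_decomp, f);
//   // f is now complete on every rank of the plane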

#endif