XGC1
toroidal_average.hpp
1 #ifndef TOROIDAL_AVERAGE_HPP
2 #define TOROIDAL_AVERAGE_HPP
3 
4 #include "my_mirror_view.hpp"
6 
7 // Does a copy to the MPI memory space if different from the View's memory space
8 // Optional DESTINATION_RANK: if >= 0, do MPI_Reduce to that rank rather than MPI_Allreduce
9 template<class T>
10 void toroidal_sum_in_place(const DomainDecomposition<DeviceType>& pol_decomp, T& f, int DESTINATION_RANK=-1){
11 #ifdef USE_MPI
12  // No operation if n_planes is 1
13  if(pol_decomp.mpi.n_intpl_ranks==1) return;
14 
15  // Cast to 1D with an unmanaged view for a generic element-wise operation
16  View<double*,CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());
17 
18  using MPIHostType = HostType; // Do this reduction on the host; GPU-aware MPI consistently hangs or aborts here
19 
20  // Copy to host if on GPU and GPU-aware MPI is not available
21  auto f_mpi_d = my_mirror_scratch_view(f_1D,MPIHostType());
22  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
23  mirror_copy(f_mpi, f_1D);
24 
25  /*** Sum all planes ***/
26  bool send_result_to_all_ranks = (DESTINATION_RANK==-1);
27  if(send_result_to_all_ranks){
28  MPI_Allreduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, pol_decomp.mpi.intpl_comm);
29  }else{
30  if(pol_decomp.mpi.my_intpl_rank==DESTINATION_RANK){
31  MPI_Reduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, DESTINATION_RANK, pol_decomp.mpi.intpl_comm);
32  }else{
33  MPI_Reduce(f_mpi.data(), f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, DESTINATION_RANK, pol_decomp.mpi.intpl_comm);
34  }
35  }
36 
37  // Copy back
38  if(send_result_to_all_ranks || pol_decomp.mpi.my_intpl_rank==DESTINATION_RANK){
39  mirror_copy(f_1D, f_mpi);
40  }
41 #endif
42 }
43 
44 // Does a copy to the MPI memory space if different from the View's memory space
45 template<class T>
46 void toroidal_average_in_place(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
47 #ifdef USE_MPI
48  // No operation if n_planes is 1
49  if(pol_decomp.mpi.n_intpl_ranks==1) return;
50 
51  // Cast to 1D with an unmanaged view for a generic element-wise operation
52  View<double*,CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());
53 
54  using MPIHostType = HostType; // Do this MPI_Allreduce on Host; GPU-aware MPI consistently hangs or aborts here
55 
56  // Copy to host if on GPU and GPU-aware MPI is not available
57  auto f_mpi_d = my_mirror_scratch_view(f_1D,MPIHostType());
58  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
59  mirror_copy(f_mpi, f_1D);
60 
61  /*** Sum all planes ***/
62  MPI_Allreduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, pol_decomp.mpi.intpl_comm);
63 
64  // Copy back
65  mirror_copy(f_1D, f_mpi);
66 
67  /*** Normalize by n planes ***/
68  // Invert first so that the loop has a multiplication rather than a division
69  double inv_n_planes = 1.0/pol_decomp.mpi.n_intpl_ranks;
70 
71  // Do the normalization
72  Kokkos::parallel_for("toroidal_average", Kokkos::RangePolicy<typename T::execution_space>(0, f.size()), KOKKOS_LAMBDA(const int i){
73  f_1D(i) *= inv_n_planes;
74  });
75  Kokkos::fence();
76 #endif
77 }
78 
79 template<class T>
80 View<double*,CLayout, typename T::device_type> split_toroidal_average(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
81  // Cast to 1D with an unmanaged view for a generic element-wise operation
82  View<double*,CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());
83 
84  // Initialize temporary memory to hold the toroidal average
85  View<double*,CLayout, typename T::device_type> toroidal_avg(NoInit("toroidal_avg"), f_1D.layout());
86  Kokkos::deep_copy(toroidal_avg, f_1D);
87 
88  // Calculate the average
89  toroidal_average_in_place(pol_decomp, toroidal_avg);
90 
91  // Remove the average from f
92  Kokkos::parallel_for("toroidal_average", Kokkos::RangePolicy<typename T::execution_space>(0, f.size()), KOKKOS_LAMBDA(const int i){
93  f_1D(i) -= toroidal_avg(i);
94  });
95  Kokkos::fence();
96 
97  return toroidal_avg;
98 }
99 
100 template<class T>
101 void remove_toroidal_average(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
102  auto toroidal_avg = split_toroidal_average(pol_decomp, f);
103 }
104 
105 #endif
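
A minimal usage sketch of toroidal_sum_in_place, showing the MPI_Allreduce default versus the MPI_Reduce path selected by DESTINATION_RANK. The wrapper function example_toroidal_sum, the View name diag, and the extent 64 are illustrative assumptions, not part of XGC1; only toroidal_sum_in_place and the View/CLayout/NoInit/DeviceType aliases come from the sources shown on this page.

#include "toroidal_average.hpp"

// Hypothetical driver; assumes MPI is initialized and pol_decomp describes
// the plane decomposition. The extent 64 is arbitrary.
void example_toroidal_sum(const DomainDecomposition<DeviceType>& pol_decomp)
{
  View<double*, CLayout, DeviceType> diag(NoInit("diag"), 64);
  // ... accumulate per-plane diagnostic contributions into diag ...

  // Every rank in the inter-plane communicator receives the summed result (MPI_Allreduce)
  toroidal_sum_in_place(pol_decomp, diag);

  // Only inter-plane rank 0 receives the sum (MPI_Reduce); on the other ranks
  // the result is not copied back, so diag keeps its local values there
  toroidal_sum_in_place(pol_decomp, diag, 0);
}
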
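A similar sketch for the averaging routines. Again, the wrapper example_toroidal_average, the View name field, and the extent 1000 are assumptions; toroidal_average_in_place, split_toroidal_average, and remove_toroidal_average are the functions defined above.

#include "toroidal_average.hpp"

// Hypothetical driver; assumes MPI is initialized and pol_decomp describes
// the plane decomposition. The extent 1000 is arbitrary.
void example_toroidal_average(const DomainDecomposition<DeviceType>& pol_decomp)
{
  View<double*, CLayout, DeviceType> field(NoInit("field"), 1000);
  // ... fill field with per-plane values ...

  // Replace the local copy of the field with its average over all toroidal planes
  toroidal_average_in_place(pol_decomp, field);

  // Alternatively, split the field: afterwards field holds only the fluctuating
  // part and avg holds the toroidal average
  auto avg = split_toroidal_average(pol_decomp, field);

  // Or discard the average and keep only the fluctuating part in field
  remove_toroidal_average(pol_decomp, field);
}

Since each routine casts f.data() to double* and uses f.size() for the element count, any contiguous Kokkos View of doubles (1D or multidimensional with CLayout) can be passed in place of the 1D View used here.
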
void mirror_copy(T1 &view_dest, const T2 &view_src)
Definition: my_mirror_view.hpp:122
View< T *, CLayout, Device > my_mirror_scratch_view(const View< T *, CLayout, Device, Kokkos::MemoryTraits< Kokkos::Unmanaged >> &view, Device nd)
Definition: my_mirror_view.hpp:97
View< T *, CLayout, Device > my_mirror_view(const View< T *, CLayout, Device > &view, Device nd)
Definition: my_mirror_view.hpp:14
void parallel_for(const std::string name, int n_ptl, Function func, Option option, HostAoSoA aosoa_h, DeviceAoSoA aosoa_d)
Definition: streamed_parallel_for.hpp:252
Kokkos::Device< HostExSpace, HostMemSpace > HostType
Definition: space_settings.hpp:57
Kokkos::ViewAllocateWithoutInitializing NoInit
Definition: space_settings.hpp:69
void toroidal_sum_in_place(const DomainDecomposition< DeviceType > &pol_decomp, T &f, int DESTINATION_RANK=-1)
Definition: toroidal_average.hpp:10
void toroidal_average_in_place(const DomainDecomposition< DeviceType > &pol_decomp, T &f)
Definition: toroidal_average.hpp:46
void remove_toroidal_average(const DomainDecomposition< DeviceType > &pol_decomp, T &f)
Definition: toroidal_average.hpp:101
View< double *, CLayout, typename T::device_type > split_toroidal_average(const DomainDecomposition< DeviceType > &pol_decomp, T &f)
Definition: toroidal_average.hpp:80