XGC1
toroidal_average.hpp
1 #ifndef TOROIDAL_AVERAGE_HPP
2 #define TOROIDAL_AVERAGE_HPP
3 
4 #include "my_mirror_view.hpp"
6 
7 // Does a copy to the MPI memory space if different from the View's memory space
8 // Optional DESTINATION_RANK: if >= 0, do MPI_Reduce to that rank rather than MPI_Allreduce
9 template<class T>
10 void toroidal_sum_in_place(const DomainDecomposition<DeviceType>& pol_decomp, T& f, int DESTINATION_RANK=-1){
11 #ifdef USE_MPI
12  // No operation if n_planes is 1
13  if(pol_decomp.mpi.n_intpl_ranks==1) return;
14 
15  // Cast to 1D with an unmanaged view for a generic element-wise operation
16  View<double*,CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());
17 
18  using MPIHostType = HostType; // Do this reduction on the host; GPU-aware MPI consistently hangs or aborts here
19 
20  // Copy to host if on GPU and GPU-aware MPI is not available
21  auto f_mpi_d = my_mirror_scratch_view(f_1D,MPIHostType());
22  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
23  mirror_copy(f_mpi, f_1D);
24 
25  /*** Sum all planes ***/
26  bool send_result_to_all_ranks = (DESTINATION_RANK==-1);
27  if(send_result_to_all_ranks){
28  MPI_Allreduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, pol_decomp.mpi.intpl_comm);
29  }else{
30  if(pol_decomp.mpi.my_intpl_rank==DESTINATION_RANK){
31  MPI_Reduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, DESTINATION_RANK, pol_decomp.mpi.intpl_comm);
32  }else{
33  MPI_Reduce(f_mpi.data(), f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, DESTINATION_RANK, pol_decomp.mpi.intpl_comm);
34  }
35  }
36 
37  // Copy back
38  if(send_result_to_all_ranks || pol_decomp.mpi.my_intpl_rank==DESTINATION_RANK){
39  mirror_copy(f_1D, f_mpi);
40  }
41 #endif
42 }
43 
44 // Does a copy to the MPI memory space if different from the View's memory space
45 template<class T>
46 void toroidal_average_in_place(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
47 #ifdef USE_MPI
48  // No operation if n_planes is 1
49  if(pol_decomp.mpi.n_intpl_ranks==1) return;
50 
51  // Cast to 1D with an unmanaged view for a generic element-wise operation
52  View<double*,CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());
53 
54  using MPIHostType = HostType; // Do this MPI_Allreduce on Host; GPU-aware MPI consistently hangs or aborts here
55 
56  // Copy to host if on GPU and GPU-aware MPI is not available
57  auto f_mpi_d = my_mirror_scratch_view(f_1D,MPIHostType());
58  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
59  mirror_copy(f_mpi, f_1D);
60 
61  /*** Sum all planes ***/
62  MPI_Allreduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, pol_decomp.mpi.intpl_comm);
63 
64  // Copy back
65  mirror_copy(f_1D, f_mpi);
66 
67  /*** Normalize by n planes ***/
68  // Invert first so that the loop has a multiplication rather than a division
69  double inv_n_planes = 1.0/pol_decomp.mpi.n_intpl_ranks;
70 
71  // Do the normalization
72  Kokkos::parallel_for("toroidal_average", Kokkos::RangePolicy<typename T::execution_space>(0, f.size()), KOKKOS_LAMBDA(const int i){
73  f_1D(i) *= inv_n_planes;
74  });
75  Kokkos::fence();
76 #endif
77 }
78 
79 template<class T>
80 View<double*,CLayout, typename T::device_type> split_toroidal_average(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
81  // Cast to 1D with an unmanaged view for a generic element-wise operation
82  View<double*,CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());
83 
84  // Initialize temporary memory to hold the toroidal average
85  View<double*,CLayout, typename T::device_type> toroidal_avg(NoInit("toroidal_avg"), f_1D.layout());
86  Kokkos::deep_copy(toroidal_avg, f_1D);
87 
88  // Calculate the average
89  toroidal_average_in_place(pol_decomp, toroidal_avg);
90 
91  // Remove the average from f
92  Kokkos::parallel_for("toroidal_average", Kokkos::RangePolicy<typename T::execution_space>(0, f.size()), KOKKOS_LAMBDA(const int i){
93  f_1D(i) -= toroidal_avg(i);
94  });
95  Kokkos::fence();
96 
97  return toroidal_avg;
98 }
99 
100 template<class T>
101 void remove_toroidal_average(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
102  auto toroidal_avg = split_toroidal_average(pol_decomp, f);
103 }
104 
105 #endif
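
A minimal usage sketch of toroidal_sum_in_place, showing the MPI_Allreduce default versus the MPI_Reduce path selected by DESTINATION_RANK. The wrapper function example_toroidal_sum, the View name diag, and the extent 64 are illustrative assumptions, not part of XGC1; only toroidal_sum_in_place and the View/CLayout/NoInit/DeviceType aliases come from the sources shown on this page.

#include "toroidal_average.hpp"

// Hypothetical driver; assumes MPI is initialized and pol_decomp describes
// the plane decomposition. The extent 64 is arbitrary.
void example_toroidal_sum(const DomainDecomposition<DeviceType>& pol_decomp)
{
  View<double*, CLayout, DeviceType> diag(NoInit("diag"), 64);
  // ... accumulate per-plane diagnostic contributions into diag ...

  // Every rank in the inter-plane communicator receives the summed result (MPI_Allreduce)
  toroidal_sum_in_place(pol_decomp, diag);

  // Only inter-plane rank 0 receives the sum (MPI_Reduce); on the other ranks
  // the result is not copied back, so diag keeps its local values there
  toroidal_sum_in_place(pol_decomp, diag, 0);
}
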
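A similar sketch for the averaging routines. Again, the wrapper example_toroidal_average, the View name field, and the extent 1000 are assumptions; toroidal_average_in_place, split_toroidal_average, and remove_toroidal_average are the functions defined above.

#include "toroidal_average.hpp"

// Hypothetical driver; assumes MPI is initialized and pol_decomp describes
// the plane decomposition. The extent 1000 is arbitrary.
void example_toroidal_average(const DomainDecomposition<DeviceType>& pol_decomp)
{
  View<double*, CLayout, DeviceType> field(NoInit("field"), 1000);
  // ... fill field with per-plane values ...

  // Replace the local copy of the field with its average over all toroidal planes
  toroidal_average_in_place(pol_decomp, field);

  // Alternatively, split the field: afterwards field holds only the fluctuating
  // part and avg holds the toroidal average
  auto avg = split_toroidal_average(pol_decomp, field);

  // Or discard the average and keep only the fluctuating part in field
  remove_toroidal_average(pol_decomp, field);
}

Since each routine casts f.data() to double* and uses f.size() for the element count, any contiguous Kokkos View of doubles (1D or multidimensional with CLayout) can be passed in place of the 1D View used here.
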
void mirror_copy(T1 &view_dest, const T2 &view_src)
Definition: my_mirror_view.hpp:122
View< T *, CLayout, Device > my_mirror_scratch_view(const View< T *, CLayout, Device, Kokkos::MemoryTraits< Kokkos::Unmanaged >> &view, Device nd)
Definition: my_mirror_view.hpp:97
View< T *, CLayout, Device > my_mirror_view(const View< T *, CLayout, Device > &view, Device nd)
Definition: my_mirror_view.hpp:14
void parallel_for(const std::string name, int n_ptl, Function func, Option option, HostAoSoA aosoa_h, DeviceAoSoA aosoa_d)
Definition: streamed_parallel_for.hpp:252
Kokkos::Device< HostExSpace, HostMemSpace > HostType
Definition: space_settings.hpp:57
Kokkos::ViewAllocateWithoutInitializing NoInit
Definition: space_settings.hpp:69
void toroidal_sum_in_place(const DomainDecomposition< DeviceType > &pol_decomp, T &f, int DESTINATION_RANK=-1)
Definition: toroidal_average.hpp:10
void toroidal_average_in_place(const DomainDecomposition< DeviceType > &pol_decomp, T &f)
Definition: toroidal_average.hpp:46
void remove_toroidal_average(const DomainDecomposition< DeviceType > &pol_decomp, T &f)
Definition: toroidal_average.hpp:101
View< double *, CLayout, typename T::device_type > split_toroidal_average(const DomainDecomposition< DeviceType > &pol_decomp, T &f)
Definition: toroidal_average.hpp:80