XGCa
poloidal_sum.hpp
#ifndef POLOIDAL_SUM_HPP
#define POLOIDAL_SUM_HPP

#include "my_mirror_view.hpp"
#include "timer_macro.hpp"

// Element-wise sum of a View over all planes, done in place. Does a copy to the
// MPI memory space if different from the View's memory space
template<class T>
void poloidal_sum_in_place(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  // Cast to 1D with an unmanaged view for a generic element-wise operation
  View<double*, CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());

  using MPIHostType = HostType; // Do this MPI_Allreduce on Host; GPU-aware MPI consistently hangs or aborts here

  // Copy to host if on GPU, since GPU-aware MPI is not used here
  auto f_mpi_d = my_mirror_scratch_view(f_1D, MPIHostType());
  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
  mirror_copy(f_mpi, f_1D);

  /*** Sum all planes ***/
  MPI_Allreduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, pol_decomp.mpi.plane_comm);

  // Copy back
  mirror_copy(f_1D, f_mpi);

  Kokkos::fence();
}
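
// Usage sketch (illustrative only; `grid`, `nphi`, and `field` are hypothetical
// names, not part of this header). Each plane rank accumulates its local
// contribution, then the element-wise total over planes is formed in place:
//
//   View<double**, CLayout, DeviceType> field(NoInit("field"), grid.nnode, nphi);
//   // ... fill field with this plane's contribution ...
//   poloidal_sum_in_place(pol_decomp, field);
//   // field now holds the sum over all planes, on every plane rank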

// Broadcast a view from the root rank of the plane to the rest of the ranks on the plane
template<class Device>
void poloidal_bcast(const DomainDecomposition<DeviceType>& pol_decomp, const View<double*, CLayout, Device>& full_view, int ROOT_RANK){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  // Copy to MPIDeviceType space if different from Device
  GPTLstart("POL_BCAST_MIRROR");
  auto f_mpi = my_mirror_view(full_view, MPIDeviceType());
  if(pol_decomp.mpi.my_plane_rank==ROOT_RANK) mirror_copy(f_mpi, full_view);
  GPTLstop("POL_BCAST_MIRROR");

  // Broadcast
  GPTLstart("POL_BCAST_MPI");
  MPI_Bcast(f_mpi.data(), f_mpi.size(), MPI_DOUBLE, ROOT_RANK, pol_decomp.mpi.plane_comm);
  GPTLstop("POL_BCAST_MPI");

  // Copy back
  GPTLstart("POL_BCAST_MIRRORBACK");
  mirror_copy(full_view, f_mpi);
  GPTLstop("POL_BCAST_MIRRORBACK");
}
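
// Usage sketch (illustrative only; `n` is a hypothetical size). The root rank
// of the plane fills the view; after the call every plane rank holds a copy:
//
//   View<double*, CLayout, DeviceType> buf(NoInit("buf"), n);
//   if(pol_decomp.mpi.my_plane_rank == 0){ /* fill buf on the root */ }
//   poloidal_bcast(pol_decomp, buf, 0);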

// Gather view of size grid.nnode
template<class Device>
void poloidal_gather(const DomainDecomposition<DeviceType>& pol_decomp, const View<double*, CLayout, Device>& full_view){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  GPTLstart("POL_GATHER_SETUP");
  int nv = 1; // values per node
  auto mpi_plan = pol_decomp.mpi_distribution_plan(nv);
  GPTLstop("POL_GATHER_SETUP");
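
  // Illustrative example (hypothetical numbers, assuming the plan partitions
  // the view by owned vertices): with 3 plane ranks owning 4, 3, and 3 of 10
  // vertices and nv==1, the plan would contain
  //   cnts   = {4, 3, 3}
  //   displs = {0, 4, 7}
  // so plane rank 1 sees my_count()==3 and my_displ()==4.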

  //using MPIHostType = HostType; // Do this MPI_Allgatherv on Host for now, should try with GPU-aware MPI

  GPTLstart("POL_GATHER_MIRROR");
  auto f_mpi = my_mirror_view(full_view, MPIDeviceType());
  mirror_copy(f_mpi, full_view);

  View<double*, CLayout, MPIDeviceType> f_mpi_in(NoInit("f_mpi_in"), mpi_plan.my_count());
  View<double*, CLayout, MPIDeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_mpi_unm(f_mpi.data() + mpi_plan.my_displ(), mpi_plan.my_count());
  Kokkos::deep_copy(f_mpi_in, f_mpi_unm);
  GPTLstop("POL_GATHER_MIRROR");

  GPTLstart("POL_GATHER_MPI");
  //MPI_Allgatherv(MPI_IN_PLACE, mpi_plan.my_count(), MPI_DOUBLE, f_mpi.data(), mpi_plan.cnts.data(), mpi_plan.displs.data(), MPI_DOUBLE, pol_decomp.mpi.plane_comm);

  // Alternative algorithm: Gatherv to rank 0, then Bcast. Seems faster than the Allgatherv above, though it shouldn't be
  MPI_Gatherv(f_mpi_in.data(), mpi_plan.my_count(), MPI_DOUBLE, f_mpi.data(), mpi_plan.cnts.data(), mpi_plan.displs.data(), MPI_DOUBLE, 0, pol_decomp.mpi.plane_comm);
  MPI_Bcast(f_mpi.data(), f_mpi.size(), MPI_DOUBLE, 0, pol_decomp.mpi.plane_comm);
  GPTLstop("POL_GATHER_MPI");

  GPTLstart("POL_GATHER_MIRRORBACK");
  // Copy back
  mirror_copy(full_view, f_mpi);
  GPTLstop("POL_GATHER_MIRRORBACK");
}
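
// Usage sketch for the 1D overload (illustrative only; `grid` is hypothetical).
// Each rank fills only its own slice of a grid.nnode-sized view, then gathers:
//
//   View<double*, CLayout, DeviceType> field(NoInit("field"), grid.nnode);
//   // ... fill this rank's slice [my_displ, my_displ + my_count) ...
//   poloidal_gather(pol_decomp, field);
//   // field is now complete on every rank of the plane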

template<class Device>
void poloidal_gather(const DomainDecomposition<DeviceType>& pol_decomp, const View<double***, CLayout, Device>& full_view){
  // No operation if n_planes is 1
  if(pol_decomp.mpi.n_plane_ranks==1) return;

  GPTLstart("POL_GATHER_SETUP");
  View<int*, CLayout, HostType> cnts(NoInit("cnts"), pol_decomp.mpi.n_plane_ranks);
  View<int*, CLayout, HostType> displs(NoInit("displs"), pol_decomp.mpi.n_plane_ranks);

  // Fill the cnts and displs arrays with the number of values owned by each processor
  int nv = full_view.extent(0)*full_view.extent(2); // values per vertex node: nvr*nvz
  for(int i = 0; i<cnts.size(); i++){
    displs(i) = nv*(pol_decomp.gvid0_pid_h(i) - 1); // gvid0_pid_h is 1-indexed
    cnts(i) = nv*(pol_decomp.gvid0_pid_h(i+1) - pol_decomp.gvid0_pid_h(i));
  }
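
  // Worked example (hypothetical numbers): with nv==6 and
  // gvid0_pid_h = {1, 5, 9, 11} across 3 plane ranks, the loop yields
  //   displs = {0, 24, 48}
  //   cnts   = {24, 24, 12}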

  // Note: These are currently not the same as pol_decomp.node_offset and pol_decomp.nnodes. That's confusing! ALS
  int my_node_send_count = pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank+1) - pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank);
  int my_send_count = my_node_send_count*nv;
  int my_node_offset = pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank) - 1;

  GPTLstop("POL_GATHER_SETUP");

  // Need to transpose first; do this in a temporary array
  GPTLstart("POL_GATHER_ALLOC1");
  View<double***, CLayout, Device> tmp(NoInit("tmp"), my_node_send_count, full_view.extent(0), full_view.extent(2));
  GPTLstop("POL_GATHER_ALLOC1");

  GPTLstart("POL_GATHER_TRANSP1");
  Kokkos::parallel_for("transpose", Kokkos::RangePolicy<typename Device::execution_space>(0, tmp.extent(0)), KOKKOS_LAMBDA(const int inode){
    for(int imu = 0; imu < tmp.extent(1); imu++){
      for(int ivp = 0; ivp < tmp.extent(2); ivp++){
        tmp(inode, imu, ivp) = full_view(imu, inode+my_node_offset, ivp);
      }
    }
  });
  Kokkos::fence();
  GPTLstop("POL_GATHER_TRANSP1");
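
  // After the transpose, the node index varies slowest, so this rank's
  // contribution is one contiguous run of my_send_count doubles in tmp,
  // which lets the Allgatherv below exchange it as a single buffer.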

  //using MPIHostType = HostType; // Do this MPI_Allgatherv on Host; GPU-aware MPI consistently hangs or aborts here

  GPTLstart("POL_GATHER_MIRROR");
  // Cast to 1D with an unmanaged view for a generic element-wise operation, then copy to host if on GPU and GPU-aware MPI is not available
  View<double*, CLayout, Device, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(tmp.data()), tmp.size());
  auto f_mpi_d = my_mirror_scratch_view(f_1D, MPIDeviceType());
  auto f_mpi = my_mirror_view(f_1D, MPIDeviceType(), f_mpi_d);
  mirror_copy(f_mpi, f_1D);

  // Now do the same for the output view, copying into a temporary view for the transpose
  View<double***, CLayout, Device> full_tmp(NoInit("full_tmp"), full_view.extent(1), full_view.extent(0), full_view.extent(2));
  View<double*, CLayout, Device, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D_out((double*)(full_tmp.data()), full_tmp.size());
  auto f_mpi_d_out = my_mirror_scratch_view(f_1D_out, MPIDeviceType());
  auto f_mpi_out = my_mirror_view(f_1D_out, MPIDeviceType(), f_mpi_d_out);
  // No copy needed here since the output view will be overwritten
  GPTLstop("POL_GATHER_MIRROR");

  GPTLstart("POL_GATHER_MPI");
  MPI_Allgatherv(f_mpi.data(), my_send_count, MPI_DOUBLE, f_mpi_out.data(), cnts.data(), displs.data(), MPI_DOUBLE, pol_decomp.mpi.plane_comm);
  GPTLstop("POL_GATHER_MPI");

  GPTLstart("POL_GATHER_MIRRORBACK");
  // Copy back
  mirror_copy(f_1D_out, f_mpi_out);
  GPTLstop("POL_GATHER_MIRRORBACK");

  // Transpose back into full_view
  GPTLstart("POL_GATHER_TRANSP2");
  Kokkos::parallel_for("transpose_back", Kokkos::RangePolicy<typename Device::execution_space>(0, full_tmp.extent(0)), KOKKOS_LAMBDA(const int inode){
    for(int imu = 0; imu < full_tmp.extent(1); imu++){
      for(int ivp = 0; ivp < full_tmp.extent(2); ivp++){
        full_view(imu, inode, ivp) = full_tmp(inode, imu, ivp);
      }
    }
  });
  Kokkos::fence();
  GPTLstop("POL_GATHER_TRANSP2");
}
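
// Usage sketch for the 3D overload (illustrative only; `n_mu`, `n_vp`, and
// `grid` are hypothetical names). The view is laid out (imu, inode, ivp), and
// each rank fills only its own node range before the gather:
//
//   View<double***, CLayout, DeviceType> f(NoInit("f"), n_mu, grid.nnode, n_vp);
//   // ... fill f(:, owned node range, :) on this rank ...
//   poloidal_gather(pol_decomp, f);
//   // f is now complete on every rank of the plane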

#endif