XGCa
poloidal_sum.hpp
1 #ifndef POLOIDAL_SUM_HPP
2 #define POLOIDAL_SUM_HPP
3 
4 // Sums the view over all poloidal planes in place; stages a copy in the MPI memory space if it differs from the View's memory space
5 template<class T>
6 void poloidal_sum_in_place(const DomainDecomposition<DeviceType>& pol_decomp, T& f){
7  // No operation if n_planes is 1
8  if(pol_decomp.mpi.n_plane_ranks==1) return;
9 
10  // Cast to 1D with an unmanaged view for a generic element-wise operation
11  View<double*,CLayout, typename T::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(f.data()), f.size());
12 
13  using MPIHostType = HostType; // Do this MPI_Allreduce on Host; GPU-aware MPI consistently hangs or aborts here
14 
15  // Copy to host if the view is on the GPU (GPU-aware MPI is not used here)
16  auto f_mpi_d = my_mirror_scratch_view(f_1D,MPIHostType());
17  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
18  mirror_copy(f_mpi, f_1D);
19 
20  /*** Sum all planes ***/
21  MPI_Allreduce(MPI_IN_PLACE, f_mpi.data(), f_mpi.size(), MPI_DOUBLE, MPI_SUM, pol_decomp.mpi.plane_comm);
22 
23  // Copy back
24  mirror_copy(f_1D, f_mpi);
25 
26  Kokkos::fence();
27 }
28 
29 
30 void poloidal_gather(const DomainDecomposition<DeviceType>& pol_decomp, const View<double***, CLayout, DeviceType>& full_view){
31  // No operation if n_planes is 1
32  if(pol_decomp.mpi.n_plane_ranks==1) return;
33 
34  GPTLstart("POL_GATHER_SETUP");
35  View<int*,CLayout,HostType> cnts(NoInit("cnts"), pol_decomp.mpi.n_plane_ranks);
36  View<int*,CLayout,HostType> displs(NoInit("displs"), pol_decomp.mpi.n_plane_ranks);
37 
38  // Fill cnts with the number of values each plane rank owns, and displs with the matching offsets
39  int nv = full_view.extent(0)*full_view.extent(2); // number of values per vertex node (nvr*nvz)
40  for(int i = 0; i<cnts.size(); i++){
41  displs(i) = nv*(pol_decomp.gvid0_pid_h(i) - 1); // gvid0_pid_h is 1-indexed
42  cnts(i) = nv*(pol_decomp.gvid0_pid_h(i+1) - pol_decomp.gvid0_pid_h(i));
43  }
44 
45  // Note: These are currently not the same as pol_decomp.node_offset and pol_decomp.nnodes. That's confusing! ALS
46  int my_node_send_count = pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank+1) - pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank);
47  int my_send_count = my_node_send_count*nv;
48  int my_node_offset = pol_decomp.gvid0_pid_h(pol_decomp.mpi.my_plane_rank) - 1;
49 
50  GPTLstop("POL_GATHER_SETUP");
51 
52  // The Allgatherv needs each rank's nodes as one contiguous block, so first transpose the node index to the slowest-varying position in a temporary array
53  GPTLstart("POL_GATHER_ALLOC1");
54  View<double***, CLayout, DeviceType> tmp(NoInit("tmp"), my_node_send_count, full_view.extent(0),full_view.extent(2));
55  GPTLstop("POL_GATHER_ALLOC1");
56 
57  GPTLstart("POL_GATHER_TRANSP1");
58  Kokkos::parallel_for("transpose", Kokkos::RangePolicy<ExSpace>( 0, tmp.extent(0)), KOKKOS_LAMBDA(const int inode){
59  for(int imu = 0; imu < tmp.extent(1); imu++){
60  for(int ivp = 0; ivp < tmp.extent(2); ivp++){
61  tmp(inode, imu, ivp) = full_view(imu, inode+my_node_offset, ivp);
62  }
63  }
64  });
65  Kokkos::fence();
66  GPTLstop("POL_GATHER_TRANSP1");
67 
68  using MPIHostType = HostType; // Do this MPI_Allgatherv on Host; GPU-aware MPI consistently hangs or aborts here
69 
70  GPTLstart("POL_GATHER_MIRROR");
71  // Cast to 1D with an unmanaged view so MPI sees a flat buffer, then copy to host if on GPU, since GPU-aware MPI is not used here
72  View<double*,CLayout, DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D((double*)(tmp.data()), tmp.size());
73  auto f_mpi_d = my_mirror_scratch_view(f_1D,MPIHostType());
74  auto f_mpi = my_mirror_view(f_1D, MPIHostType(), f_mpi_d);
75  mirror_copy(f_mpi, f_1D);
76 
77  // Now do the same for the output view, copying into a temporary view for the transpose
78  View<double***, CLayout, DeviceType> full_tmp(NoInit("full_tmp"), full_view.extent(1), full_view.extent(0), full_view.extent(2));
79  View<double*,CLayout, DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> f_1D_out((double*)(full_tmp.data()), full_tmp.size());
80  auto f_mpi_d_out = my_mirror_scratch_view(f_1D_out,MPIHostType());
81  auto f_mpi_out = my_mirror_view(f_1D_out, MPIHostType(), f_mpi_d_out);
82  // No copy since output view will be overwritten
83  GPTLstop("POL_GATHER_MIRROR");
84 
85  GPTLstart("POL_GATHER_MPI");
86  MPI_Allgatherv(f_mpi.data(), my_send_count, MPI_DOUBLE, f_mpi_out.data(), cnts.data(), displs.data(), MPI_DOUBLE, pol_decomp.mpi.plane_comm);
87  GPTLstop("POL_GATHER_MPI");
88 
89  GPTLstart("POL_GATHER_MIRRORBACK");
90  // Copy back
91  mirror_copy(f_1D_out, f_mpi_out);
92  GPTLstop("POL_GATHER_MIRRORBACK");
93 
94  // Transpose back into full_view
95  GPTLstart("POL_GATHER_TRANSP2");
96  Kokkos::parallel_for("transpose_back", Kokkos::RangePolicy<ExSpace>( 0, full_tmp.extent(0)), KOKKOS_LAMBDA(const int inode){
97  for(int imu = 0; imu < full_tmp.extent(1); imu++){
98  for(int ivp = 0; ivp < full_tmp.extent(2); ivp++){
99  full_view(imu, inode, ivp) = full_tmp(inode, imu, ivp);
100  }
101  }
102  });
103  Kokkos::fence();
104  GPTLstop("POL_GATHER_TRANSP2");
105 }
106 
107 #endif
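
For reference, the pattern in poloidal_sum_in_place is: flatten the view, mirror it to the host, MPI_Allreduce in place over the plane communicator, then copy the result back. Below is a minimal standalone sketch of that host-staged reduction using plain Kokkos mirrors and MPI; sum_in_place_host_staged is a hypothetical name, comm stands in for pol_decomp.mpi.plane_comm, and Kokkos::create_mirror_view replaces the project's my_mirror_scratch_view/my_mirror_view helpers.

#include <Kokkos_Core.hpp>
#include <mpi.h>

// Element-wise sum of a contiguous device view across all ranks of comm,
// staged through a host mirror as in poloidal_sum_in_place.
// Assumes the View's scalar type is double, as in the routines above.
template <class ViewType>
void sum_in_place_host_staged(const ViewType& f, MPI_Comm comm) {
  int n_ranks;
  MPI_Comm_size(comm, &n_ranks);
  if (n_ranks == 1) return; // nothing to reduce

  // Host mirror of the (possibly GPU-resident) data
  auto f_host = Kokkos::create_mirror_view(f);
  Kokkos::deep_copy(f_host, f);

  // In-place sum over all ranks; the view is contiguous, so its raw
  // pointer can be handed to MPI as a flat double buffer
  MPI_Allreduce(MPI_IN_PLACE, f_host.data(), static_cast<int>(f_host.size()),
                MPI_DOUBLE, MPI_SUM, comm);

  // Copy the summed result back to the original memory space
  Kokkos::deep_copy(f, f_host);
}

Unlike this sketch, which allocates a fresh host mirror on every call, the my_mirror_scratch_view/my_mirror_view pair appears to reuse a staging buffer and to skip the copy entirely when the view already lives in host memory.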
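
The gather in poloidal_gather relies on each plane rank owning one contiguous range of vertex nodes, recorded in the 1-indexed gvid0_pid_h array: entry i is the global index of the first vertex owned by plane rank i, with one extra entry past the last rank. A hedged sketch of the count/displacement arithmetic and the MPI_Allgatherv call follows, with hypothetical names (gather_nodes, gvid0) and assuming the data has already been packed node-major with nv doubles per node, as the transpose above arranges.

#include <mpi.h>
#include <vector>

// Gather node-major data (nv doubles per vertex node) from all plane ranks.
// gvid0 mirrors gvid0_pid_h: gvid0[r] is the 1-indexed first global vertex
// owned by rank r, and gvid0[n_ranks] points one past the last vertex.
// `full` must be pre-sized to nv * (total number of vertex nodes).
void gather_nodes(const std::vector<double>& local, std::vector<double>& full,
                  const std::vector<int>& gvid0, int nv, MPI_Comm comm) {
  int n_ranks, my_rank;
  MPI_Comm_size(comm, &n_ranks);
  MPI_Comm_rank(comm, &my_rank);

  std::vector<int> cnts(n_ranks), displs(n_ranks);
  for (int r = 0; r < n_ranks; r++) {
    displs[r] = nv * (gvid0[r] - 1);            // 1-indexed vertex -> element offset
    cnts[r]   = nv * (gvid0[r + 1] - gvid0[r]); // elements owned by rank r
  }

  MPI_Allgatherv(local.data(), cnts[my_rank], MPI_DOUBLE,
                 full.data(), cnts.data(), displs.data(), MPI_DOUBLE, comm);
}

The two transposes in poloidal_gather exist because full_view is laid out (imu, inode, ivp) with the vertex-node index in the middle: the node dimension is what is split across ranks, so it must be moved to the slowest-varying position for each rank's contribution to form one contiguous block in the send and receive buffers.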
static int GPTLstart(const char *name)
Definition: timer_macro.hpp:9
void mirror_copy(T1 &view_dest, const T2 &view_src)
Definition: my_mirror_view.hpp:122
Kokkos::Device< HostExSpace, HostMemSpace > HostType
Definition: space_settings.hpp:57
View< T *, CLayout, Device > my_mirror_scratch_view(const View< T *, CLayout, Device, Kokkos::MemoryTraits< Kokkos::Unmanaged >> &view, Device nd)
Definition: my_mirror_view.hpp:97
void poloidal_gather(const DomainDecomposition< DeviceType > &pol_decomp, const View< double ***, CLayout, DeviceType > &full_view)
Definition: poloidal_sum.hpp:30
View< T *, CLayout, Device > my_mirror_view(const View< T *, CLayout, Device > &view, Device nd)
Definition: my_mirror_view.hpp:14
void poloidal_sum_in_place(const DomainDecomposition< DeviceType > &pol_decomp, T &f)
Definition: poloidal_sum.hpp:6
void parallel_for(const std::string name, int n_ptl, Function func, Option option, HostAoSoA aosoa_h, DeviceAoSoA aosoa_d)
Definition: streamed_parallel_for.hpp:252
Kokkos::ViewAllocateWithoutInitializing NoInit
Definition: space_settings.hpp:69
static int GPTLstop(const char *name)
Definition: timer_macro.hpp:10
Kokkos::View< int *, Kokkos::LayoutRight, HostType > gvid0_pid_h
Which processors get which vertices (host)
Definition: domain_decomposition.hpp:63