XGCa
plane_field_gatherer.hpp
#ifndef PLANE_FIELD_GATHERER_HPP
#define PLANE_FIELD_GATHERER_HPP

#include "grid.hpp"
#include "my_subview.hpp"
#include "checkpoint.hpp"

class PlaneFieldGatherer{
  const DomainDecomposition<DeviceType> pol_decomp;
  const Grid<DeviceType> grid;

  bool gather_subset;  // if field decomposition is active, gather only this rank's planes/nodes
  int nplanes;
  int plane_offset;    // first plane owned by this rank when gathering a subset
  int nnode;
  int node_offset;     // first node owned by this rank when gathering a subset

  int tmp_nphi;
  View<Field<VarType::Scalar,PhiInterpType::Planes>**, CLayout,HostType> tmp_s; // for scalar fields
  View<Field<VarType::Vector,PhiInterpType::Planes>**, CLayout,HostType> tmp_v; // for vector fields

  View<Field<VarType::Scalar,PhiInterpType::Planes>*, CLayout,HostType> tmp_s_full; // for scalar fields
  View<Field<VarType::Vector,PhiInterpType::Planes>*, CLayout,HostType> tmp_v_full; // for vector fields

  /* USE_GPU without USE_GPU_AWARE_MPI requires that a temporary host array be allocated */
  int choose_tmp_nphi(){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
    return nplanes;
#else
    return 0;
#endif
  }

  /* Series of host MPI_Igatherv calls to populate phi_ff. An MPI_Allgatherv or other op
     (or an intercommunicator?) might be possible/better. */
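  /* Illustration (example numbers, not from the code): with grid.nplanes = 8 and a root whose
     field partition spans planes 6..1 (wrapping through plane 0), root_local_nplanes = 4 and the
     contributions from planes 6, 7, 0 and 1 land at displacements 0, size, 2*size and 3*size in
     the root's receive buffer, which is why the data arrives "out of order" below. */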
  template<typename T, typename FT>
  void allgather_to_local_ranks(View<FT*, CLayout,HostType>& tmp_full, T* destination){
#ifdef USE_MPI
    int one_obj_in_dbl = sizeof(FT)/8; // from bytes to doubles

    View<int*, HostType> recvcounts(NoInit("recvcounts"), pol_decomp.mpi.n_intpl_ranks);
    View<int*, HostType> displs(NoInit("displs"), pol_decomp.mpi.n_intpl_ranks);
    std::vector<MPI_Request> rrequests(pol_decomp.mpi.n_intpl_ranks);
    //printf("\nrank %d, both map: %d, %d \n", pol_decomp.mpi.my_rank, pol_decomp.field_decomp.map_from_global_intpl(0), pol_decomp.field_decomp.map_from_global_intpl(1));

    // Loop over ranks in the global inter-planar communicator, each taking turns as root
    for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++){
      int root_local_rank = pol_decomp.field_decomp.map_from_global_intpl(i_root); // will range from e.g. 0 to 7
      int root_local_first_plane = pol_decomp.field_decomp.all_first_plane(root_local_rank);
      int root_local_last_plane = pol_decomp.field_decomp.all_last_plane(root_local_rank);
      int root_local_nplanes = positive_modulo(root_local_last_plane - root_local_first_plane, grid.nplanes) + 1;

      int root_local_first_node = pol_decomp.field_decomp.all_first_node(root_local_rank);
      int root_local_nnode = pol_decomp.field_decomp.all_last_node(root_local_rank) - pol_decomp.field_decomp.all_first_node(root_local_rank) + 1;
      int size = root_local_nnode * one_obj_in_dbl;

      // Now determine where each contributing rank will send its contribution
      // Initialize to zero
      Kokkos::deep_copy(recvcounts, 0);
      Kokkos::deep_copy(displs, 0);

      // Note that the data arrives OUT OF ORDER to handle the case where the ghost planes wrap around plane 0
      for(int i=0; i<root_local_nplanes; i++){
        int contributor = (i+root_local_first_plane)%grid.nplanes;
        recvcounts(contributor) = size;
        displs(contributor) = i*size;
        //if(i<displs.size()-1) displs(i+1) = displs(i) + recvcounts(contributor);
      }

      int my_size = recvcounts(pol_decomp.mpi.my_intpl_rank);

      /*checkpoint("");
      checkpoint("");
      for(int ipr = 0; ipr<pol_decomp.mpi.nranks; ipr++){
        if(pol_decomp.mpi.my_rank==ipr){
          printf("\nRANK %d, plane_rank %d, intpl_rank %d:", ipr, pol_decomp.mpi.my_plane_rank, pol_decomp.mpi.my_intpl_rank);
          printf("\nnode_offset: %d, roots: %d, root_local_nnode: %d", node_offset, root_local_first_node, root_local_nnode);
          printf("\nsize: %d, my_size: %d", size, my_size);
          printf("\nroot(%d) rank: %d, first plane: %d, last plane: %d, n_planes: %d", i_root, root_local_rank, root_local_first_plane, root_local_last_plane, root_local_nplanes);
          printf("\nrecvcounts = :");
          for(int i=0; i<recvcounts.size(); i++){
            printf(" %d", recvcounts(i));
          }
          printf("\ndispl = :");
          for(int i=0; i<displs.size(); i++){
            printf(" %d", displs(i));
          }
        }
        checkpoint("");
        checkpoint("");
      }*/

      MPI_Igatherv(tmp_full.data()+root_local_first_node, my_size, MPI_DOUBLE, destination, recvcounts.data(), displs.data(), MPI_DOUBLE, i_root, pol_decomp.mpi.intpl_comm, &(rrequests[i_root]));
      //MPI_Wait(&(rrequests[i_root]), MPI_STATUS_IGNORE);

      //checkpoint("Finished delivering to an intpl rank");
    }
    // Wait for the Igatherv operations to complete
    for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++) MPI_Wait(&(rrequests[i_root]), MPI_STATUS_IGNORE);
#endif
  }

  /* Copy the irho==0 index of a rho_ff field */
  template<typename FT>
  void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT**, CLayout,HostType>& rho_ff_h){
    constexpr int ZERO_GYRO = 0;

    // Check that the host array is allocated
    if(rho_ff_h.extent(0) != grid.nnode || rho_ff_h.extent(1)<1) exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode,nrho)\n");

    // Copy this rank's contribution to tmp_full
    Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
      tmp_full(inode)=rho_ff_h(inode,ZERO_GYRO);
    });
  }

  /* Copy a ff field */
  template<typename FT>
  void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT*, CLayout,HostType>& ff_h){
    // Check that the host array is allocated
    if(ff_h.extent(0) != grid.nnode) exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode)\n");

    // Copy this rank's contribution to tmp_full
    Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
      tmp_full(inode)=ff_h(inode);
    });
  }

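  /* Summary of the data flow implemented below: copy_to_tmp_full() stages this rank's
     contribution (grid.nnode entries) in tmp_full; the MPI gather then assembles
     (nplanes, nnode) entries either directly into the device view phi_ff (CPU build,
     or GPU build with GPU-aware MPI) or into the host buffer tmp, which is deep_copied
     to the device at the end. */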
  template<typename T_h, typename FT>
  void gather_phi_ff_on_device(View<FT**, CLayout,HostType>& tmp, View<FT*, CLayout,HostType>& tmp_full, const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
    constexpr bool gpu_without_gpu_aware_MPI = true;
#else
    constexpr bool gpu_without_gpu_aware_MPI = false;
#endif

    // Copy this rank's contribution to tmp_full
    copy_to_tmp_full(tmp_full, rho_ff_h);

    // Allocate phi_ff device view
    phi_ff = View<FT**, CLayout,DeviceType>("gfpack_view", nplanes, nnode);

    // If using GPUs without GPU-aware MPI, need to do MPI gather into a temporary host allocation
    auto destination = (gpu_without_gpu_aware_MPI ? tmp.data() : phi_ff.data());

    //checkpoint("");
    //checkpoint("");
#ifdef USE_MPI
    /*for(int ipr = 0; ipr<pol_decomp.mpi.nranks; ipr++){
      if(pol_decomp.mpi.my_rank==ipr){
        printf("\nRANK %d:", ipr);
        printf("\nphi_ff(%d, %d)\n", phi_ff.extent(0), phi_ff.extent(1));
        printf("\ngpu_without_gpu_aware_MPI=%d\n", gpu_without_gpu_aware_MPI ? 1 : 0);
        printf("\ntmp(%d, %d)\n", tmp.extent(0), tmp.extent(1));
        printf("\ntmp_full(%d)\n", tmp_full.extent(0));
      }
      checkpoint("");
      checkpoint("");
    }*/
#endif
    // Gather contributions from each rank; if we're just gathering a subset of the field, the logic is more complicated
    if(gather_subset){
      allgather_to_local_ranks(tmp_full, destination);
    }else{
#ifdef USE_MPI
      int one_obj_in_dbl = sizeof(FT)/8; // from bytes to doubles
      int sz = nnode * one_obj_in_dbl;
      MPI_Allgather(tmp_full.data(), sz, MPI_DOUBLE, destination, sz, MPI_DOUBLE, pol_decomp.mpi.intpl_comm);
#else
      // Used for mini_appKernel; just replicate the rho field for now
      if(gpu_without_gpu_aware_MPI)
        for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(tmp, i), tmp_full);
      else
        for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(phi_ff, i), tmp_full);
#endif
    }
    //checkpoint("Entering copy");
    // If using GPUs without GPU-aware MPI, copy from the temporary allocation to device memory
    if(gpu_without_gpu_aware_MPI){
      Kokkos::deep_copy(phi_ff, tmp);
    }
    //checkpoint("Finished copy");
  }

  template<typename FT>
  inline View<FT**, CLayout,HostType>& which_tmp();

  template<typename FT>
  inline View<FT*, CLayout,HostType>& which_tmp_full();

 public:

  PlaneFieldGatherer(const DomainDecomposition<DeviceType>& pol_decomp, const Grid<DeviceType>& grid)
    : pol_decomp(pol_decomp),
      grid(grid),
      gather_subset(pol_decomp.decompose_fields),
      // If using field_decomposition, we don't copy the entire field
      nplanes(gather_subset ? pol_decomp.field_decomp.n_planes : grid.nplanes),
      plane_offset(gather_subset ? pol_decomp.field_decomp.first_plane : 0),
      nnode(gather_subset ? pol_decomp.field_decomp.n_nodes : grid.nnode),
      node_offset(gather_subset ? pol_decomp.field_decomp.first_node : 0),
      tmp_nphi(choose_tmp_nphi()),
      tmp_s("tmp_s", tmp_nphi, nnode),
      tmp_v("tmp_v", tmp_nphi, nnode),
      tmp_s_full("tmp_s_full", grid.nnode),
      tmp_v_full("tmp_v_full", grid.nnode)
  {
    //printf("\nplane_offset: %d, node_offset: %d\n", plane_offset, node_offset);
    //printf("\ntmp_nphi: %d, tmp_s(%d, %d), tmp_v(%d, %d)\n", tmp_nphi, tmp_s.extent(0), tmp_s.extent(1), tmp_v.extent(0), tmp_v.extent(1));
  }

  // Interface to use the correct tmp
  template<typename T_h, typename FT>
  void gather_phi_ff_on_device(const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
    gather_phi_ff_on_device(which_tmp<FT>(), which_tmp_full<FT>(), rho_ff_h, phi_ff);
  }
};

template<>
inline View<Field<VarType::Scalar,PhiInterpType::Planes>**, CLayout,HostType>& PlaneFieldGatherer::which_tmp(){
  return tmp_s;
}

template<>
inline View<Field<VarType::Vector,PhiInterpType::Planes>**, CLayout,HostType>& PlaneFieldGatherer::which_tmp(){
  return tmp_v;
}

template<>
inline View<Field<VarType::Scalar,PhiInterpType::Planes>*, CLayout,HostType>& PlaneFieldGatherer::which_tmp_full(){
  return tmp_s_full;
}

template<>
inline View<Field<VarType::Vector,PhiInterpType::Planes>*, CLayout,HostType>& PlaneFieldGatherer::which_tmp_full(){
  return tmp_v_full;
}

#endif
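For orientation, a minimal usage sketch of the public interface above. The field names (dpot_rho_h, dpot_phi_ff) and the extent nrho are placeholders introduced here for illustration, not taken from this file; the actual call sites live elsewhere in XGC.

// Hypothetical call site: gather a scalar rho_ff field onto the device.
using ScalarFT = Field<VarType::Scalar, PhiInterpType::Planes>;

PlaneFieldGatherer pfg(pol_decomp, grid);

// Host input of shape (grid.nnode, nrho), filled elsewhere; only the irho==0 slice is gathered
View<ScalarFT**, CLayout, HostType> dpot_rho_h("dpot_rho_h", grid.nnode, nrho);

// Allocated inside the gather as (nplanes, nnode) on the device
View<ScalarFT**, CLayout, DeviceType> dpot_phi_ff;

pfg.gather_phi_ff_on_device(dpot_rho_h, dpot_phi_ff);
// dpot_phi_ff now holds the planes owned by this rank (or all planes if field decomposition is off).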