XGCa
plane_field_gatherer.hpp
#ifndef PLANE_FIELD_GATHERER_HPP
#define PLANE_FIELD_GATHERER_HPP

#include "grid.hpp"
#include "my_subview.hpp"
#include "get_potential_grad.hpp"

class PlaneFieldGatherer {
  const DomainDecomposition<DeviceType> pol_decomp;
  const Grid<DeviceType> grid;

  bool gather_subset;
  bool gather_near_field;
  int near_field_pid;
  int nplanes;
  int nnode;

  int tmp_nphi;
  View<Field<VarType::Scalar,PIT_GLOBAL>**, CLayout,HostType> tmp_s; // for scalar fields
  View<Field<VarType::Vector,PIT_GLOBAL>**, CLayout,HostType> tmp_v; // for vector fields

  View<Field<VarType::Scalar,PIT_GLOBAL>*, CLayout,HostType> tmp_s_full; // for scalar fields
  View<Field<VarType::Vector,PIT_GLOBAL>*, CLayout,HostType> tmp_v_full; // for vector fields

  /* USE_GPU without USE_GPU_AWARE_MPI requires that a temporary array be allocated */
  int choose_tmp_nphi(){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
    return nplanes;
#else
    return 0;
#endif
  }

  /* Series of host MPI_Igatherv to populate phi_ff. An MPI_Allgatherv or other op
     (or intercommunicator?) might be possible/better. */
  template<typename T, typename FT>
  void allgather_to_local_ranks(View<FT*, CLayout,HostType>& tmp_full, T* destination){
#ifdef USE_MPI
    int one_obj_in_dbl = sizeof(FT)/8; // from bytes to doubles
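    // (Counts below are in units of MPI_DOUBLE; this assumes each Field object is
    //  a plain aggregate of doubles, e.g. a three-component vector of doubles
    //  would give one_obj_in_dbl == 3.)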

    View<int*, HostType> recvcounts(NoInit("recvcounts"), pol_decomp.mpi.n_intpl_ranks);
    View<int*, HostType> displs(NoInit("displs"), pol_decomp.mpi.n_intpl_ranks);
    std::vector<MPI_Request> rrequests(pol_decomp.mpi.n_intpl_ranks);

    // Loop over ranks in global inter-planar communicator, each taking turns as root
    for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++){
      int root_local_rank = pol_decomp.field_decomp.map_from_global_intpl(i_root); // will range from e.g. 0 to 7

      // Gather from the ranks that hold the assigned planes, or from the ranks that hold the near field planes if gathering the near field
      int source_rank = gather_near_field ? root_near_field_pid : root_local_rank;

      int root_local_first_plane = pol_decomp.field_decomp.all_first_plane(source_rank);
      int root_local_last_plane = pol_decomp.field_decomp.all_last_plane(source_rank);
      int root_local_nplanes = positive_modulo(root_local_last_plane - root_local_first_plane, grid.nplanes) + 1;
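      // positive_modulo handles domains that wrap past plane 0: e.g. with
      // grid.nplanes == 8, first_plane == 6 and last_plane == 1 give
      // positive_modulo(1 - 6, 8) + 1 == 3 + 1 == 4 planes (6, 7, 0, 1).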

      int root_local_first_node = pol_decomp.field_decomp.all_first_node(source_rank);
      int root_local_nnode = pol_decomp.field_decomp.all_last_node(source_rank) - pol_decomp.field_decomp.all_first_node(source_rank) + 1;

      int size = root_local_nnode * one_obj_in_dbl;

      // Now determine where each contributing rank will send its contribution
      // Initialize to zero
      Kokkos::deep_copy(recvcounts, 0);
      Kokkos::deep_copy(displs, 0);

      // Note that the data arrives OUT OF ORDER to handle the case where the ghost planes wrap around plane 0
      for(int i=0; i<root_local_nplanes; i++){
        int contributor = (i+root_local_first_plane)%grid.nplanes;
        recvcounts(contributor) = size;
        displs(contributor) = i*size;
        //if(i<displs.size()-1) displs(i+1) = displs(i) + recvcounts(contributor);
      }
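      // Continuing the four-plane example above: contributors 6, 7, 0, 1 are
      // assigned displacements 0, size, 2*size, 3*size, so plane 0's data lands
      // after planes 6 and 7 in the destination buffer (out of rank order).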

      int my_size = recvcounts(pol_decomp.mpi.my_intpl_rank);

      MPI_Igatherv(tmp_full.data()+root_local_first_node, my_size, MPI_DOUBLE, destination, recvcounts.data(), displs.data(), MPI_DOUBLE, i_root, pol_decomp.mpi.intpl_comm, &(rrequests[i_root]));
    }
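    // All of the gathers are in flight before any is waited on, so the MPI
    // library is free to progress them concurrently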
    // Wait for Igatherv to complete
    for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++) MPI_Wait(&(rrequests[i_root]), MPI_STATUS_IGNORE);
#endif
  }

  /* Copy the irho==0 index of a rho_ff field */
  template<typename FT>
  void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT**, CLayout,HostType>& rho_ff_h){
    constexpr int ZERO_GYRO = 0;

    // Check that the host array is allocated
    if(rho_ff_h.extent(0) != grid.nnode || rho_ff_h.extent(1)<1) exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode,nrho)\n");

    // Copy this rank's contribution to tmp_full
    Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
      tmp_full(inode)=rho_ff_h(inode,ZERO_GYRO);
    });
  }

  /* Copy a ff field */
  template<typename FT>
  void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT*, CLayout,HostType>& ff_h){
    // Check that the host array is allocated
    if(ff_h.extent(0) != grid.nnode) exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode)\n");

    // Copy this rank's contribution to tmp_full
    Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
      tmp_full(inode)=ff_h(inode);
    });
  }

public:

  template<typename GT0, typename GT, typename GT2>
  void calculate_phi_ff_on_device(const Simulation<DeviceType>& sml, const Grid<DeviceType>& grid, const MagneticField<DeviceType>& magnetic_field, Smoothing& smoothing, GetPotentialGradTemp<DeviceType, DeviceType>& tmp, const View<Field<VarType::Scalar,PhiInterpType::None>*,CLayout,HostType>& dpot_h, const GT0& pot0_h, GT& phi_ff, GT2& pot_phi_ff, bool potential_is_requested=true, bool use_field00=false, bool ignore_poloidal_dpot=false){

    // Need 3 ghost planes for this algorithm
    int n_initial_ghost_planes = 2;
    int n_final_ghost_planes = 1;
    int n_ghost_planes = n_initial_ghost_planes + n_final_ghost_planes;
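    // Layout: the two initial ghost planes precede plane 0 and the final ghost
    // plane follows the last plane, matching the wrap-around copies made below.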

    GPTLstart("PRE_GATHER_POT");

    // Allocate phi_ff device view
    phi_ff = GT("gfpack_view", nplanes, nnode);

    // Make an unmanaged view to reuse the phi_ff allocation:
    auto* ptr_to_end_of_phi_ff = phi_ff.data()+phi_ff.size();
    int size_of_dpot_phi = (nplanes+n_ghost_planes)*nnode;
    double* dpot_phi_addr = (double*)(ptr_to_end_of_phi_ff) - size_of_dpot_phi;
    View<double**,CLayout,DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> dpot_phi(dpot_phi_addr, nplanes+n_ghost_planes, nnode);
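    // (Reuse works because dpot_phi needs (nplanes+3)*nnode doubles while phi_ff
    //  holds nplanes*nnode Field objects of several doubles each, so the tail of
    //  phi_ff's allocation is presumably large enough; get_field_grad consumes
    //  dpot_phi while filling phi_ff from the front.)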
    GPTLstop("PRE_GATHER_POT");

    GPTLstart("GATHER_POT");
#ifdef USE_MPI
    // Gather the potential from all planes, leaving room at the beginning of the array for two ghost planes

    // Mirror views for when GPU-aware MPI is not available
    auto pot_tmp = my_mirror_view(dpot_h, MPIDeviceType());
    mirror_copy(pot_tmp, dpot_h);

    auto dpot_phi_MPI_d = my_mirror_scratch_view(dpot_phi, MPIDeviceType());
    auto dpot_phi_tmp = my_mirror_view(dpot_phi, MPIDeviceType(), dpot_phi_MPI_d);
    auto destination = dpot_phi_tmp.data() + nnode*n_initial_ghost_planes; // 2 initial ghost planes
    MPI_Allgather(pot_tmp.data(), nnode, MPI_DOUBLE, destination, nnode, MPI_DOUBLE, pol_decomp.mpi.intpl_comm);
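    // (Ranks in intpl_comm are assumed to be ordered by plane, so plane p lands
    //  at dpot_phi(p + n_initial_ghost_planes, :), i.e. plane 0 at index 2.)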
    mirror_copy(dpot_phi, dpot_phi_tmp); // Send to device
#endif
    GPTLstop("GATHER_POT");

    // The gathered view dpot_phi has size (nplanes+3): 2 ghost planes at the beginning, 1 at the end

    // If the ghost planes wrap around, copy them
    // e.g. with 8 planes named Plane 0-7:
    //   dpot_phi(0) is a copy of Plane 6
    //   dpot_phi(1) is a copy of Plane 7
    //   dpot_phi(10) is a copy of Plane 0
    int nplanes_l = nplanes;
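    // (nplanes is copied to a local so the KOKKOS_LAMBDA captures the value by
    //  copy rather than dereferencing this-> inside device code.)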
    Kokkos::parallel_for("set_up_ghost_planes", Kokkos::RangePolicy<ExSpace>(0, grid.nnode), KOKKOS_LAMBDA( const int i ){
      dpot_phi(0,i) = dpot_phi(nplanes_l-2 + n_initial_ghost_planes, i);
      dpot_phi(1,i) = dpot_phi(nplanes_l-1 + n_initial_ghost_planes, i);
      dpot_phi(nplanes_l + n_initial_ghost_planes,i) = dpot_phi(0 + n_initial_ghost_planes, i);
    });
    Kokkos::fence();

    /**** Get potential gradient ****/
    GPTLstart("CALC_E_FROM_POT");

    GPTLstart("GET_POT_GRAD_PHI");
    // Allocate pot_phi_ff device view
    if(potential_is_requested) pot_phi_ff = GT2("gfpack_potview", nplanes, nnode);

    auto gpg_field_args = GetPotGradFieldArgs<DeviceType, DeviceType>
      (dpot_phi, ignore_poloidal_dpot);

    // Set up E00
    if(use_field00){
      gpg_field_args.field00 = Field00<DeviceType>(pot0_h, sml.grad_psitheta);
      gpg_field_args.field00.calculate_gradient(grid, tmp.grad_matrices);
    }

    // Request potential and/or gradient outputs
    if(potential_is_requested) gpg_field_args.request_potential(pot_phi_ff);
    gpg_field_args.request_gradient(phi_ff);

    // Get requested fields
    get_field_grad(sml, grid, pol_decomp, magnetic_field, smoothing, gpg_field_args, tmp);

    GPTLstop("GET_POT_GRAD_PHI");
    GPTLstop("CALC_E_FROM_POT");
  }

private:

  template<typename T_h, typename FT>
  void gather_phi_ff_on_device(View<FT**, CLayout,HostType>& tmp, View<FT*, CLayout,HostType>& tmp_full, const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
    constexpr bool gpu_without_gpu_aware_MPI = true;
#else
    constexpr bool gpu_without_gpu_aware_MPI = false;
#endif

    // Copy this rank's contribution to tmp_full
    copy_to_tmp_full(tmp_full, rho_ff_h);

    // Allocate phi_ff device view
    phi_ff = View<FT**, CLayout,DeviceType>("gfpack_view", nplanes, nnode);

    // If using GPUs without GPU-aware MPI, need to do MPI gather into a temporary host allocation
    auto destination = (gpu_without_gpu_aware_MPI ? tmp.data() : phi_ff.data());
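    // (Without GPU-aware MPI the gather cannot write to device memory directly,
    //  so it targets the host buffer tmp and is deep_copied to phi_ff below.)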

    // Gather contributions from each rank; if we're just gathering a subset of the field, the logic is more complicated
    if(gather_subset){
      allgather_to_local_ranks(tmp_full, destination);
    }else{
#ifdef USE_MPI
      int one_obj_in_dbl = sizeof(FT)/8; // from bytes to doubles
      int sz = nnode * one_obj_in_dbl;
      MPI_Allgather(tmp_full.data(), sz, MPI_DOUBLE, destination, sz, MPI_DOUBLE, pol_decomp.mpi.intpl_comm);
#else
      // Used for mini_appKernel; just replicate the rho field for now
      if(gpu_without_gpu_aware_MPI)
        for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(tmp, i), tmp_full);
      else
        for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(phi_ff, i), tmp_full);
#endif
    }
    // If using GPUs without GPU-aware MPI, copy from the temporary allocation to device memory
    if(gpu_without_gpu_aware_MPI){
      Kokkos::deep_copy(phi_ff, tmp);
    }
  }

  template<typename FT>
  inline View<FT**, CLayout,HostType>& which_tmp();

  template<typename FT>
  inline View<FT*, CLayout,HostType>& which_tmp_full();

public:

  PlaneFieldGatherer(const DomainDecomposition<DeviceType>& pol_decomp, const Grid<DeviceType>& grid, bool near_field=false)
    : pol_decomp(pol_decomp),
      grid(grid),
      gather_subset(pol_decomp.decompose_fields),
      gather_near_field(near_field),
      near_field_pid((gather_subset && near_field) ? pol_decomp.field_decomp.find_domain_owner(pol_decomp.plane_index, grid.nplanes, pol_decomp.node_offset, grid.nnode) : 0),
      // If using field_decomposition, we don't copy the entire field
      nplanes(gather_subset ? (near_field ? pol_decomp.field_decomp.all_n_planes(near_field_pid, grid.nplanes) : pol_decomp.field_decomp.n_planes) : grid.nplanes),
      nnode(gather_subset ? (near_field ? pol_decomp.field_decomp.all_n_nodes(near_field_pid) : pol_decomp.field_decomp.n_nodes) : grid.nnode),
      tmp_nphi(choose_tmp_nphi()),
      tmp_s("tmp_s", tmp_nphi, nnode),
      tmp_v("tmp_v", tmp_nphi, nnode),
      tmp_s_full("tmp_s_full", grid.nnode),
      tmp_v_full("tmp_v_full", grid.nnode)
  {}

  // Interface to use correct tmp
  template<typename T_h, typename FT>
  void gather_phi_ff_on_device(const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
    gather_phi_ff_on_device(which_tmp<FT>(), which_tmp_full<FT>(), rho_ff_h, phi_ff);
  }
};
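
// Typical usage (illustrative sketch, not from this file; pol_decomp, grid, and
// the host field view rho_ff_h are assumed to be set up elsewhere):
//   PlaneFieldGatherer pfg(pol_decomp, grid);
//   View<Field<VarType::Scalar,PIT_GLOBAL>**, CLayout,DeviceType> phi_ff;
//   pfg.gather_phi_ff_on_device(rho_ff_h, phi_ff); // gathers every plane's field to the device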

template<>
inline View<Field<VarType::Scalar,PIT_GLOBAL>**, CLayout,HostType>& PlaneFieldGatherer::which_tmp(){
  return tmp_s;
}

template<>
inline View<Field<VarType::Vector,PIT_GLOBAL>**, CLayout,HostType>& PlaneFieldGatherer::which_tmp(){
  return tmp_v;
}

template<>
inline View<Field<VarType::Scalar,PIT_GLOBAL>*, CLayout,HostType>& PlaneFieldGatherer::which_tmp_full(){
  return tmp_s_full;
}

template<>
inline View<Field<VarType::Vector,PIT_GLOBAL>*, CLayout,HostType>& PlaneFieldGatherer::which_tmp_full(){
  return tmp_v_full;
}

#endif