#ifndef PLANE_FIELD_GATHERER_HPP
#define PLANE_FIELD_GATHERER_HPP

// (Include list reconstructed from this header's cross references)
#include <vector>
#include <mpi.h>
#include "space_settings.hpp"
#include "globals.hpp"
#include "my_subview.hpp"
#include "grid.hpp"
#include "field_decomposition.hpp"
#include "domain_decomposition.hpp"
class PlaneFieldGatherer {
    const DomainDecomposition<DeviceType> pol_decomp;
    const Grid<DeviceType> grid;

    bool gather_subset;   // whether only this rank's subset of planes/nodes is gathered
    int nplanes, plane_offset, nnode, node_offset;

    int tmp_nphi;         // plane count of the host staging buffers
    View<Field<VarType::Scalar, PhiInterpType::Planes>**, CLayout, HostType> tmp_s;
    View<Field<VarType::Vector, PhiInterpType::Planes>**, CLayout, HostType> tmp_v;
    View<Field<VarType::Scalar, PhiInterpType::Planes>*,  CLayout, HostType> tmp_s_full;
    View<Field<VarType::Vector, PhiInterpType::Planes>*,  CLayout, HostType> tmp_v_full;

    // Host staging buffers are only needed on GPU builds without GPU-aware MPI,
    // where MPI must deliver into host memory before the data moves to device
    int choose_tmp_nphi(){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
        return nplanes;   // reconstructed: stage every gathered plane on the host
#else
        return 0;         // reconstructed: no host staging buffer needed
#endif
    }
    // Gather the full-grid field, one plane per contributing rank, onto the
    // ranks that own each plane range in the field decomposition. One
    // nonblocking MPI_Igatherv is posted per root so the gathers can overlap.
    template<typename T, typename FT>
    void allgather_to_local_ranks(View<FT*, CLayout, HostType>& tmp_full, T* destination){
        // Fields are shipped as MPI_DOUBLE; one FT object is this many doubles
        int one_obj_in_dbl = sizeof(FT)/sizeof(double);

        View<int*, HostType> recvcounts(NoInit("recvcounts"), pol_decomp.mpi.n_intpl_ranks);
        View<int*, HostType> displs(NoInit("displs"), pol_decomp.mpi.n_intpl_ranks);

        std::vector<MPI_Request> rrequests(pol_decomp.mpi.n_intpl_ranks);

        for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++){
            // Plane/node ranges owned by this root (reconstructed sketch; the
            // original lookup, elided in this listing, reads them from
            // pol_decomp.field_decomp)
            int root_local_first_node  = pol_decomp.field_decomp.all_first_node(i_root);
            int root_local_nnode       = pol_decomp.field_decomp.all_last_node(i_root) - root_local_first_node + 1;
            int root_local_first_plane = pol_decomp.field_decomp.all_first_plane(i_root);
            int root_local_nplanes     = pol_decomp.field_decomp.all_last_plane(i_root) - root_local_first_plane + 1;

            // Every contributing rank sends the root's node slice of one plane
            int size = root_local_nnode * one_obj_in_dbl;

            Kokkos::deep_copy(recvcounts, 0);
            Kokkos::deep_copy(displs, 0);

            // Plane i of the root's range is contributed by the rank that owns it
            for(int i=0; i<root_local_nplanes; i++){
                int contributor = (i + root_local_first_plane) % grid.nplanes;
                recvcounts(contributor) = size;
                displs(contributor) = i*size;
            }

            // This rank's send count (zero if it contributes nothing to this root)
            int my_size = recvcounts(pol_decomp.mpi.my_intpl_rank);

            MPI_Igatherv(tmp_full.data() + root_local_first_node, my_size, MPI_DOUBLE,
                         destination, recvcounts.data(), displs.data(), MPI_DOUBLE,
                         i_root, pol_decomp.mpi.intpl_comm, &(rrequests[i_root]));
        }

        // Wait for all posted gathers to complete
        for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++)
            MPI_Wait(&(rrequests[i_root]), MPI_STATUS_IGNORE);
    }
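    // A standalone sketch (hypothetical sizes, no MPI required) of the
    // recvcounts/displs layout built above: a root that owns planes 2..3 of a
    // 4-plane grid receives one plane-slice of `size` doubles from each of
    // ranks 2 and 3, placed in plane order.
    //
    //   #include <cstdio>
    //   int main(){
    //       const int nplanes = 4, root_first_plane = 2, root_nplanes = 2, size = 8;
    //       int recvcounts[4] = {}, displs[4] = {};
    //       for(int i=0; i<root_nplanes; i++){
    //           int contributor = (i + root_first_plane) % nplanes;   // ranks 2, then 3
    //           recvcounts[contributor] = size;
    //           displs[contributor] = i*size;
    //       }
    //       for(int r=0; r<nplanes; r++)
    //           std::printf("rank %d: recvcount=%d displ=%d\n", r, recvcounts[r], displs[r]);
    //   }   // prints counts {0,0,8,8} and displacements {0,0,0,8}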
    // Flatten a gyroaveraged host field of shape (nnode, nrho) into the
    // full-grid staging view, keeping only the zeroth gyro-ring
    template<typename FT>
    void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT**, CLayout,HostType>& rho_ff_h){
        constexpr int ZERO_GYRO = 0;

        if(rho_ff_h.extent(0) != grid.nnode || rho_ff_h.extent(1) < 1)
            exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode,nrho)\n");

        Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
            tmp_full(inode) = rho_ff_h(inode, ZERO_GYRO);
        });
    }
    // Overload for a field with no gyroaverage dimension: a straight copy
    template<typename FT>
    void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT*, CLayout,HostType>& ff_h){
        if(ff_h.extent(0) != grid.nnode)
            exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode)\n");

        Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
            tmp_full(inode) = ff_h(inode);
        });
    }
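    // For reference, the flattening pattern above as a self-contained Kokkos
    // program (hypothetical extents and view names; not part of this header):
    //
    //   #include <Kokkos_Core.hpp>
    //   int main(int argc, char* argv[]){
    //       Kokkos::initialize(argc, argv);
    //       {
    //           const int nnode = 100, nrho = 4, ZERO_GYRO = 0;
    //           Kokkos::View<double**, Kokkos::LayoutRight, Kokkos::HostSpace> src("src", nnode, nrho);
    //           Kokkos::View<double*,  Kokkos::LayoutRight, Kokkos::HostSpace> dst("dst", nnode);
    //           Kokkos::parallel_for("flatten",
    //               Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, nnode),
    //               [=](const int inode){ dst(inode) = src(inode, ZERO_GYRO); });
    //       }
    //       Kokkos::finalize();
    //   }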
    // Gather the per-plane host field into a device view holding all planes
    // this rank needs. Without GPU-aware MPI, MPI gathers into the host buffer
    // tmp and the result moves to the device in one bulk deep_copy at the end.
    template<typename T_h, typename FT>
    void gather_phi_ff_on_device(View<FT**, CLayout,HostType>& tmp, View<FT*, CLayout,HostType>& tmp_full, const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
        constexpr bool gpu_without_gpu_aware_MPI = true;
#else
        constexpr bool gpu_without_gpu_aware_MPI = false;
#endif

        // Flatten the input field into the full-grid staging view
        copy_to_tmp_full(tmp_full, rho_ff_h);

        // Allocate the device-resident result
        phi_ff = View<FT**, CLayout,DeviceType>("gfpack_view", nplanes, nnode);

        // MPI writes into host memory unless it can target device memory directly
        auto destination = (gpu_without_gpu_aware_MPI ? tmp.data() : phi_ff.data());

        // (Branch structure reconstructed; the original conditions are elided
        // in this listing)
        if(gather_subset){
            // Each rank receives only the planes/nodes of its field decomposition
            allgather_to_local_ranks(tmp_full, destination);
        }else if(pol_decomp.mpi.n_intpl_ranks > 1){
            // Every rank contributes its plane and receives every plane
            int one_obj_in_dbl = sizeof(FT)/sizeof(double);
            int sz = nnode * one_obj_in_dbl;
            MPI_Allgather(tmp_full.data(), sz, MPI_DOUBLE, destination, sz, MPI_DOUBLE, pol_decomp.mpi.intpl_comm);
        }else{
            // Single inter-plane rank: the one available plane is replicated
            // across all plane slots
            if(gpu_without_gpu_aware_MPI)
                for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(tmp, i), tmp_full);
            else
                for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(phi_ff, i), tmp_full);
        }

        // Staged through host: finish with one contiguous host-to-device copy
        if(gpu_without_gpu_aware_MPI){
            Kokkos::deep_copy(phi_ff, tmp);
        }
    }
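    // The host-staging idea above in isolation (hypothetical buffers; assumes
    // MPI_Init and Kokkos::initialize have been called and each rank owns one
    // plane of nnode doubles): gather into a host view, then move the whole
    // result to the device with a single contiguous deep_copy.
    //
    //   Kokkos::View<double**, Kokkos::LayoutRight, Kokkos::HostSpace> host_buf("host_buf", nplanes, nnode);
    //   Kokkos::View<double**, Kokkos::LayoutRight> device_buf("device_buf", nplanes, nnode);
    //   MPI_Allgather(my_plane.data(), nnode, MPI_DOUBLE,
    //                 host_buf.data(), nnode, MPI_DOUBLE, comm);
    //   Kokkos::deep_copy(device_buf, host_buf);   // one bulk host-to-device transfer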
    // Map the field type FT to the staging buffers of matching variable type
    template<typename FT>
    inline View<FT**, CLayout,HostType>& which_tmp();

    template<typename FT>
    inline View<FT*, CLayout,HostType>& which_tmp_full();
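    // The specializations are elided in this listing; given the members above,
    // they presumably map the scalar field type to tmp_s/tmp_s_full and the
    // vector field type to tmp_v/tmp_v_full, e.g. (sketch, at namespace scope):
    //
    //   template<>
    //   inline View<Field<VarType::Scalar, PhiInterpType::Planes>**, CLayout, HostType>&
    //   PlaneFieldGatherer::which_tmp(){ return tmp_s; }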
 public:

    PlaneFieldGatherer(const DomainDecomposition<DeviceType>& pol_decomp, const Grid<DeviceType>& grid)
        : pol_decomp(pol_decomp),
          grid(grid),
          gather_subset(pol_decomp.decompose_fields),
          nplanes(gather_subset ? pol_decomp.field_decomp.n_planes : grid.nplanes),
          plane_offset(gather_subset ? pol_decomp.field_decomp.first_plane : 0),
          nnode(gather_subset ? pol_decomp.field_decomp.n_nodes : grid.nnode),
          node_offset(gather_subset ? pol_decomp.field_decomp.first_node : 0),
          tmp_nphi(choose_tmp_nphi()),
          tmp_s("tmp_s", tmp_nphi, nnode),
          tmp_v("tmp_v", tmp_nphi, nnode),
          tmp_s_full("tmp_s_full", grid.nnode),
          tmp_v_full("tmp_v_full", grid.nnode)
    {}
    // Public entry point: dispatches to the staging buffers matching the
    // field type FT
    template<typename T_h, typename FT>
    void gather_phi_ff_on_device(const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
        gather_phi_ff_on_device(which_tmp<FT>(), which_tmp_full<FT>(), rho_ff_h, phi_ff);
    }
};
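// Typical usage (sketch; assumes an initialized pol_decomp/grid pair and a
// host field rho_ff_h of shape (nnode, nrho)):
//
//   PlaneFieldGatherer pfg(pol_decomp, grid);
//   View<Field<VarType::Scalar, PhiInterpType::Planes>**, CLayout, DeviceType> phi_ff;
//   pfg.gather_phi_ff_on_device(rho_ff_h, phi_ff);   // phi_ff: (nplanes, nnode) on device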
#endif // PLANE_FIELD_GATHERER_HPP