XGCa
plane_field_gatherer.hpp
#ifndef PLANE_FIELD_GATHERER_HPP
#define PLANE_FIELD_GATHERER_HPP

#include "grid.hpp"
#include "my_subview.hpp"
#include "checkpoint.hpp"

class PlaneFieldGatherer{
  const DomainDecomposition<DeviceType> pol_decomp;
  const Grid<DeviceType> grid;

  bool gather_subset;  // if field decomposition is active, gather only this rank's planes/nodes
  int nplanes;
  int plane_offset;    // first plane owned by this rank when gathering a subset
  int nnode;
  int node_offset;     // first node owned by this rank when gathering a subset

  int tmp_nphi;
  View<Field<VarType::Scalar,PhiInterpType::Planes>**, CLayout,HostType> tmp_s; // for scalar fields
  View<Field<VarType::Vector,PhiInterpType::Planes>**, CLayout,HostType> tmp_v; // for vector fields

  View<Field<VarType::Scalar,PhiInterpType::Planes>*, CLayout,HostType> tmp_s_full; // for scalar fields
  View<Field<VarType::Vector,PhiInterpType::Planes>*, CLayout,HostType> tmp_v_full; // for vector fields

  /* USE_GPU without USE_GPU_AWARE_MPI requires that a temporary host array be allocated */
  int choose_tmp_nphi(){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
    return nplanes;
#else
    return 0;
#endif
  }

  /* Series of host MPI_Igatherv calls to populate phi_ff. An MPI_Allgatherv or other op
     (or an intercommunicator?) might be possible/better. */
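  /* Illustration (example numbers, not from the code): with grid.nplanes = 8 and a root whose
     field partition spans planes 6..1 (wrapping through plane 0), root_local_nplanes = 4 and the
     contributions from planes 6, 7, 0 and 1 land at displacements 0, size, 2*size and 3*size in
     the root's receive buffer, which is why the data arrives "out of order" below. */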
  template<typename T, typename FT>
  void allgather_to_local_ranks(View<FT*, CLayout,HostType>& tmp_full, T* destination){
#ifdef USE_MPI
    int one_obj_in_dbl = sizeof(FT)/8; // from bytes to doubles

    View<int*, HostType> recvcounts(NoInit("recvcounts"), pol_decomp.mpi.n_intpl_ranks);
    View<int*, HostType> displs(NoInit("displs"), pol_decomp.mpi.n_intpl_ranks);
    std::vector<MPI_Request> rrequests(pol_decomp.mpi.n_intpl_ranks);
    //printf("\nrank %d, both map: %d, %d \n", pol_decomp.mpi.my_rank, pol_decomp.field_decomp.map_from_global_intpl(0), pol_decomp.field_decomp.map_from_global_intpl(1));

    // Loop over ranks in the global inter-planar communicator, each taking turns as root
    for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++){
      int root_local_rank = pol_decomp.field_decomp.map_from_global_intpl(i_root); // will range from e.g. 0 to 7
      int root_local_first_plane = pol_decomp.field_decomp.all_first_plane(root_local_rank);
      int root_local_last_plane = pol_decomp.field_decomp.all_last_plane(root_local_rank);
      int root_local_nplanes = positive_modulo(root_local_last_plane - root_local_first_plane, grid.nplanes) + 1;

      int root_local_first_node = pol_decomp.field_decomp.all_first_node(root_local_rank);
      int root_local_nnode = pol_decomp.field_decomp.all_last_node(root_local_rank) - pol_decomp.field_decomp.all_first_node(root_local_rank) + 1;
      int size = root_local_nnode * one_obj_in_dbl;

      // Now determine where each contributing rank will send its contribution
      // Initialize to zero
      Kokkos::deep_copy(recvcounts, 0);
      Kokkos::deep_copy(displs, 0);

      // Note that the data arrives OUT OF ORDER to handle the case where the ghost planes wrap around plane 0
      for(int i=0; i<root_local_nplanes; i++){
        int contributor = (i+root_local_first_plane)%grid.nplanes;
        recvcounts(contributor) = size;
        displs(contributor) = i*size;
        //if(i<displs.size()-1) displs(i+1) = displs(i) + recvcounts(contributor);
      }

      int my_size = recvcounts(pol_decomp.mpi.my_intpl_rank);

      /*checkpoint("");
      checkpoint("");
      for(int ipr = 0; ipr<pol_decomp.mpi.nranks; ipr++){
        if(pol_decomp.mpi.my_rank==ipr){
          printf("\nRANK %d, plane_rank %d, intpl_rank %d:", ipr, pol_decomp.mpi.my_plane_rank, pol_decomp.mpi.my_intpl_rank);
          printf("\nnode_offset: %d, roots: %d, root_local_nnode: %d", node_offset, root_local_first_node, root_local_nnode);
          printf("\nsize: %d, my_size: %d", size, my_size);
          printf("\nroot(%d) rank: %d, first plane: %d, last plane: %d, n_planes: %d", i_root, root_local_rank, root_local_first_plane, root_local_last_plane, root_local_nplanes);
          printf("\nrecvcounts = :");
          for(int i=0; i<recvcounts.size(); i++){
            printf(" %d", recvcounts(i));
          }
          printf("\ndispl = :");
          for(int i=0; i<displs.size(); i++){
            printf(" %d", displs(i));
          }
        }
        checkpoint("");
        checkpoint("");
      }*/

      MPI_Igatherv(tmp_full.data()+root_local_first_node, my_size, MPI_DOUBLE, destination, recvcounts.data(), displs.data(), MPI_DOUBLE, i_root, pol_decomp.mpi.intpl_comm, &(rrequests[i_root]));
      //MPI_Wait(&(rrequests[i_root]), MPI_STATUS_IGNORE);

      //checkpoint("Finished delivering to an intpl rank");
    }
    // Wait for the Igatherv operations to complete
    for(int i_root=0; i_root<pol_decomp.mpi.n_intpl_ranks; i_root++) MPI_Wait(&(rrequests[i_root]), MPI_STATUS_IGNORE);
#endif
  }

  /* Copy the irho==0 index of a rho_ff field */
  template<typename FT>
  void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT**, CLayout,HostType>& rho_ff_h){
    constexpr int ZERO_GYRO = 0;

    // Check that the host array is allocated
    if(rho_ff_h.extent(0) != grid.nnode || rho_ff_h.extent(1)<1) exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode,nrho)\n");

    // Copy this rank's contribution to tmp_full
    Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
      tmp_full(inode)=rho_ff_h(inode,ZERO_GYRO);
    });
  }

  /* Copy a ff field */
  template<typename FT>
  void copy_to_tmp_full(View<FT*, CLayout,HostType>& tmp_full, const View<FT*, CLayout,HostType>& ff_h){
    // Check that the host array is allocated
    if(ff_h.extent(0) != grid.nnode) exit_XGC("\nError in gather_phi_ff_on_device: expected host view of size (nnode)\n");

    // Copy this rank's contribution to tmp_full
    Kokkos::parallel_for("gather_field_info", Kokkos::RangePolicy<HostExSpace>(0, grid.nnode), [=] (const int inode){
      tmp_full(inode)=ff_h(inode);
    });
  }

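  /* Summary of the data flow implemented below: copy_to_tmp_full() stages this rank's
     contribution (grid.nnode entries) in tmp_full; the MPI gather then assembles
     (nplanes, nnode) entries either directly into the device view phi_ff (CPU build,
     or GPU build with GPU-aware MPI) or into the host buffer tmp, which is deep_copied
     to the device at the end. */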
  template<typename T_h, typename FT>
  void gather_phi_ff_on_device(View<FT**, CLayout,HostType>& tmp, View<FT*, CLayout,HostType>& tmp_full, const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
#if defined(USE_GPU) && !defined(USE_GPU_AWARE_MPI)
    constexpr bool gpu_without_gpu_aware_MPI = true;
#else
    constexpr bool gpu_without_gpu_aware_MPI = false;
#endif

    // Copy this rank's contribution to tmp_full
    copy_to_tmp_full(tmp_full, rho_ff_h);

    // Allocate phi_ff device view
    phi_ff = View<FT**, CLayout,DeviceType>("gfpack_view", nplanes, nnode);

    // If using GPUs without GPU-aware MPI, need to do MPI gather into a temporary host allocation
    auto destination = (gpu_without_gpu_aware_MPI ? tmp.data() : phi_ff.data());

    //checkpoint("");
    //checkpoint("");
#ifdef USE_MPI
    /*for(int ipr = 0; ipr<pol_decomp.mpi.nranks; ipr++){
      if(pol_decomp.mpi.my_rank==ipr){
        printf("\nRANK %d:", ipr);
        printf("\nphi_ff(%d, %d)\n", phi_ff.extent(0), phi_ff.extent(1));
        printf("\ngpu_without_gpu_aware_MPI=%d\n", gpu_without_gpu_aware_MPI ? 1 : 0);
        printf("\ntmp(%d, %d)\n", tmp.extent(0), tmp.extent(1));
        printf("\ntmp_full(%d)\n", tmp_full.extent(0));
      }
      checkpoint("");
      checkpoint("");
    }*/
#endif
    // Gather contributions from each rank; if we're just gathering a subset of the field, the logic is more complicated
    if(gather_subset){
      allgather_to_local_ranks(tmp_full, destination);
    }else{
#ifdef USE_MPI
      int one_obj_in_dbl = sizeof(FT)/8; // from bytes to doubles
      int sz = nnode * one_obj_in_dbl;
      MPI_Allgather(tmp_full.data(), sz, MPI_DOUBLE, destination, sz, MPI_DOUBLE, pol_decomp.mpi.intpl_comm);
#else
      // Used for mini_appKernel; just replicate the rho field for now
      if(gpu_without_gpu_aware_MPI)
        for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(tmp, i), tmp_full);
      else
        for(int i=0; i<grid.nplanes; i++) Kokkos::deep_copy(my_subview(phi_ff, i), tmp_full);
#endif
    }
    //checkpoint("Entering copy");
    // If using GPUs without GPU-aware MPI, copy from the temporary allocation to device memory
    if(gpu_without_gpu_aware_MPI){
      Kokkos::deep_copy(phi_ff, tmp);
    }
    //checkpoint("Finished copy");
  }

  template<typename FT>
  inline View<FT**, CLayout,HostType>& which_tmp();

  template<typename FT>
  inline View<FT*, CLayout,HostType>& which_tmp_full();

 public:

  PlaneFieldGatherer(const DomainDecomposition<DeviceType>& pol_decomp, const Grid<DeviceType>& grid)
    : pol_decomp(pol_decomp),
      grid(grid),
      gather_subset(pol_decomp.decompose_fields),
      // If using field_decomposition, we don't copy the entire field
      nplanes(gather_subset ? pol_decomp.field_decomp.n_planes : grid.nplanes),
      plane_offset(gather_subset ? pol_decomp.field_decomp.first_plane : 0),
      nnode(gather_subset ? pol_decomp.field_decomp.n_nodes : grid.nnode),
      node_offset(gather_subset ? pol_decomp.field_decomp.first_node : 0),
      tmp_nphi(choose_tmp_nphi()),
      tmp_s("tmp_s", tmp_nphi, nnode),
      tmp_v("tmp_v", tmp_nphi, nnode),
      tmp_s_full("tmp_s_full", grid.nnode),
      tmp_v_full("tmp_v_full", grid.nnode)
  {
    //printf("\nplane_offset: %d, node_offset: %d\n", plane_offset, node_offset);
    //printf("\ntmp_nphi: %d, tmp_s(%d, %d), tmp_v(%d, %d)\n", tmp_nphi, tmp_s.extent(0), tmp_s.extent(1), tmp_v.extent(0), tmp_v.extent(1));
  }

  // Interface to use the correct tmp
  template<typename T_h, typename FT>
  void gather_phi_ff_on_device(const T_h& rho_ff_h, View<FT**, CLayout,DeviceType>& phi_ff){
    gather_phi_ff_on_device(which_tmp<FT>(), which_tmp_full<FT>(), rho_ff_h, phi_ff);
  }
};

template<>
inline View<Field<VarType::Scalar,PhiInterpType::Planes>**, CLayout,HostType>& PlaneFieldGatherer::which_tmp(){
  return tmp_s;
}

template<>
inline View<Field<VarType::Vector,PhiInterpType::Planes>**, CLayout,HostType>& PlaneFieldGatherer::which_tmp(){
  return tmp_v;
}

template<>
inline View<Field<VarType::Scalar,PhiInterpType::Planes>*, CLayout,HostType>& PlaneFieldGatherer::which_tmp_full(){
  return tmp_s_full;
}

template<>
inline View<Field<VarType::Vector,PhiInterpType::Planes>*, CLayout,HostType>& PlaneFieldGatherer::which_tmp_full(){
  return tmp_v_full;
}

#endif
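For orientation, a minimal usage sketch of the public interface above. The field names (dpot_rho_h, dpot_phi_ff) and the extent nrho are placeholders introduced here for illustration, not taken from this file; the actual call sites live elsewhere in XGC.

// Hypothetical call site: gather a scalar rho_ff field onto the device.
using ScalarFT = Field<VarType::Scalar, PhiInterpType::Planes>;

PlaneFieldGatherer pfg(pol_decomp, grid);

// Host input of shape (grid.nnode, nrho), filled elsewhere; only the irho==0 slice is gathered
View<ScalarFT**, CLayout, HostType> dpot_rho_h("dpot_rho_h", grid.nnode, nrho);

// Allocated inside the gather as (nplanes, nnode) on the device
View<ScalarFT**, CLayout, DeviceType> dpot_phi_ff;

pfg.gather_phi_ff_on_device(dpot_rho_h, dpot_phi_ff);
// dpot_phi_ff now holds the planes owned by this rank (or all planes if field decomposition is off).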