XGCa
transfer_vertex_data.hpp
1 #ifndef TRANSFER_VERTEX_DATA_HPP
2 #define TRANSFER_VERTEX_DATA_HPP
3 
5 
6 inline int pair(int np, int p, int k){
7  int out = p ^ k;
8  return (out<np ? out : -1);
9 }
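// Note on pair(): this is an XOR pairing. At exchange step k, rank r is partnered with rank r^k;
// since (r^k)^k == r the pairing is symmetric, each rank has at most one partner per step, and
// every pair of ranks meets exactly once over all steps.
// Worked example with np=3 (the caller rounds the step count up to the next power of 2, i.e. 4):
//   rank 1 sees partners  k=1: 1^1=0,  k=2: 1^2=3 -> -1 (no such rank, skipped),  k=3: 1^3=2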
10 
11 
12 template<typename T>
13 inline int n_doubles_per_vertex(const T& array);
14 
15 // 1D view has 1 value per vertex
16 template<>
17 inline int n_doubles_per_vertex(const View<double*,CLayout,HostType>& array){
18  return 1;
19 }
20 
21 // 2D view has extent(1) values per vertex
22 template<>
23 inline int n_doubles_per_vertex(const View<double**,CLayout,HostType>& array){
24  return array.extent(1);
25 }
26 
27 // VGridDistribution has species*vr*vz values per vertex
28 template<>
29 inline int n_doubles_per_vertex(const VGridDistribution<HostType>& array){
30  return array.n_species()*array.n_vr()*array.n_vz();
31 }
32 
33 // std::vector<View<double*,CLayout,HostType>> has array.size() values per vertex
34 template<>
35 inline int n_doubles_per_vertex(const std::vector<View<double*,CLayout,HostType>>& array){
36  return array.size();
37 }
38 
39 /* total_n_doubles_per_vertex is used to determine the necessary buffer size when using the constructor that takes a template pack of Views.
40  * This base-case overload has only one View left, so it returns the size of that View.
41  */
42 template<class V>
43 inline int total_n_doubles_per_vertex(const V& first) {
44  return n_doubles_per_vertex(first);
45 }
46 
47 /* total_n_doubles_per_vertex is used to determine the necessary buffer size when using the constructor that takes a template pack of Views.
48  * The generic overload adds the size of the first View in the pack, then recurses on the remaining Views in the pack.
49  */
50 template<class V, class... TRest>
51 inline int total_n_doubles_per_vertex(const V& first, const TRest&... args) {
52  return n_doubles_per_vertex(first) + total_n_doubles_per_vertex(args...);
53 }
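// For illustration (hypothetical arrays): total_n_doubles_per_vertex(density, temperature, f0)
// expands to n_doubles_per_vertex(density) + n_doubles_per_vertex(temperature) + n_doubles_per_vertex(f0),
// e.g. 1 + extent(1) + n_species()*n_vr()*n_vz() for a 1D view, a 2D view, and a VGridDistribution.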
54 
55 // Access value ip of vertex i for a given data type
56 template<typename T>
57 KOKKOS_INLINE_FUNCTION double& vertex_access(const T& array, int i, int ip);
58 
59 // 1D view: The single index is assumed to be the vertex index
60 template<>
61 KOKKOS_INLINE_FUNCTION double& vertex_access(const View<double*,CLayout,HostType>& array, int i, int ip){
62  return array(i);
63 }
64 
65 // 2D view: The first index is assumed to be the vertex index
66 template<>
67 KOKKOS_INLINE_FUNCTION double& vertex_access(const View<double**,CLayout,HostType>& array, int i, int ip){
68  return array(i,ip);
69 }
70 
71 // VGridDistribution: Use the function VGridDistribution::pull_node_index for correct access
72 template<>
73 KOKKOS_INLINE_FUNCTION double& vertex_access(const VGridDistribution<HostType>& array, int i, int ip){
74  return array.pull_node_index(i, ip);
75 }
76 
77 
78 // Resize a data type to a new number of vertices
79 template<typename T>
80 inline void resize_n_vertices(int nnodes, T& array);
81 
82 template<typename T, typename... TRest>
83 inline void resize_n_vertices(int nnodes, T& array, TRest&... args){
84  resize_n_vertices(nnodes, array);
85  resize_n_vertices(nnodes, args...);
86 }
87 
88 // 1D view
89 template<>
90 inline void resize_n_vertices(int nnodes, View<double*,CLayout,HostType>& array){
91  array = View<double*,CLayout,HostType>(NoInit(array.label()), nnodes);
92 }
93 
94 // 2D view
95 template<>
96 inline void resize_n_vertices(int nnodes, View<double**,CLayout,HostType>& array){
97  array = View<double**,CLayout,HostType>(NoInit(array.label()), nnodes, array.extent(1));
98 }
99 
100 // VGridDistribution
101 template<>
102 inline void resize_n_vertices(int nnodes, VGridDistribution<HostType>& array){
103  array.resize_n_vertices(nnodes);
104 }
105 
106 // Load a single array to a buffer. The buffer is size (nvertices, nvals_per_vertex)
107 // n is the number of vertices being loaded
108 // new_offset is where to write the array relative to the vertex range of the buffer (first dimension)
109 // old_offset is where to read the array relative to the vertex range of the array
110 // buffer is the buffer being written to
111 // arr_offset is where the values for the array are stored inside buffer (second dimension)
112 // array is the array being read from
113 template<class V>
114 inline void load_arrays(int n, int new_offset, int old_offset, const View<double**,CLayout,HostType>& buffer, int& arr_offset, const V& array) {
115  int n_vals = n_doubles_per_vertex(array);
116  Kokkos::parallel_for("pol_decomp_redist_load", Kokkos::RangePolicy<typename V::execution_space>(0, n), KOKKOS_LAMBDA(const int i){
117  int inew = i + new_offset;
118  int iold = i + old_offset;
119  for(int ip = 0; ip<n_vals; ip++){
120  buffer(inew,ip+arr_offset) = vertex_access(array, iold, ip);
121  }
122  });
123  Kokkos::fence();
124  arr_offset += n_vals;
125 }
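// For illustration: packing a 1D view (1 double per vertex) followed by a 2D view with extent(1)==2
// writes buffer columns 0 and 1..2 respectively, with arr_offset advancing 0 -> 1 -> 3,
// so each array occupies its own contiguous block of the buffer's second dimension.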
126 
127 // Load multiple arrays by loading the first in the pack, then calling this function recursively on the rest
128 template<class V, class... TRest>
129 inline void load_arrays(int n, int new_offset, int old_offset, const View<double**,CLayout,HostType>& buffer, int& arr_offset, const V& first, const TRest&... args) {
130  load_arrays(n, new_offset, old_offset, buffer, arr_offset, first);
131  load_arrays(n, new_offset, old_offset, buffer, arr_offset, args...);
132 }
133 
134 // Specialization for std::vector<View<double*,CLayout,HostType>>
135 template<>
136 inline void load_arrays(int n, int new_offset, int old_offset, const View<double**,CLayout,HostType>& buffer, int& arr_offset, const std::vector<View<double*,CLayout,HostType>>& array) {
137  int n_vals = n_doubles_per_vertex(array);
138  for(int ip=0; ip<n_vals; ip++){
139  auto view = array[ip];
140  Kokkos::parallel_for("pol_decomp_redist_load", Kokkos::RangePolicy<HostExSpace>(0, n), KOKKOS_LAMBDA(const int i){
141  int inew = i + new_offset;
142  int iold = i + old_offset;
143  buffer(inew,ip+arr_offset) = view(iold);
144  });
145  }
146  Kokkos::fence();
147  arr_offset += n_vals;
148 }
149 
150 // Load the entire buffer. Cannot call load_arrays directly because arr_offset needs to be initialized to 0.
151 template<class... Vs>
152 inline void load_buffer(int n, int new_offset, int old_offset, const View<double**,CLayout,HostType>& buffer, const Vs&... arrays){
153  if(n<=0) return;
154 
155  int arr_offset = 0;
156  load_arrays(n, new_offset, old_offset, buffer, arr_offset, arrays...);
157 }
158 
159 // Unload a single array from a buffer. The buffer is size (nvertices, nvals_per_vertex)
160 // n is the number of vertices being unloaded
161 // buffer is the buffer being read from
162 // arr_offset is where the values for the array are stored inside buffer (second dimension)
163 // array is the array being written to
164 template<class V>
165 inline void unload_arrays(int n, int vert_offset, const View<double**,CLayout,HostType>& buffer, int& arr_offset, const V& array) {
166  int n_vals = n_doubles_per_vertex(array);
167  Kokkos::parallel_for("pol_decomp_redist_unload", Kokkos::RangePolicy<typename V::execution_space>(0, n), KOKKOS_LAMBDA(const int i){
168  for(int ip = 0; ip<n_vals; ip++){
169  vertex_access(array, i+vert_offset, ip) = buffer(i,ip+arr_offset);
170  }
171  });
172  Kokkos::fence();
173  arr_offset += n_vals;
174 }
175 
176 // Unload multiple arrays by unloading the first in the pack, then calling this function recursively
177 template<class V, class... TRest>
178 inline void unload_arrays(int n, int vert_offset, const View<double**,CLayout,HostType>& buffer, int& arr_offset, const V& first, const TRest&... args) {
179  unload_arrays(n, vert_offset, buffer, arr_offset, first);
180  unload_arrays(n, vert_offset, buffer, arr_offset, args...);
181 }
182 
183 // Specialization for std::vector<View<double*,CLayout,HostType>>
184 template<>
185 inline void unload_arrays(int n, int vert_offset, const View<double**,CLayout,HostType>& buffer, int& arr_offset, const std::vector<View<double*,CLayout,HostType>>& array) {
186  int n_vals = n_doubles_per_vertex(array);
187  for(int ip=0; ip<n_vals; ip++){
188  auto view = array[ip];
189  Kokkos::parallel_for("pol_decomp_redist_unload", Kokkos::RangePolicy<HostExSpace>(0, n), KOKKOS_LAMBDA(const int i){
190  view(i+vert_offset) = buffer(i,ip+arr_offset);
191  });
192  }
193  Kokkos::fence();
194  arr_offset += n_vals;
195 }
196 
197 
198 // Count number of recvs. If not too many, prepost all receive requests (using XOR ordering and flow control)
199 inline bool do_prepost_receive_requests(const DistributionPlan& plan){
200  int n_recv = 0;
201  for(int i=0;i<plan.cnts.size();i++){
202  if(plan.cnts(i)>0 && i!=plan.my_rank){ // Skip my_rank because those vertices are not received from elsewhere
203  n_recv+=1;
204  }
205  }
206  constexpr int MAXPREPOSTS = 20; // nothing magical about this number - just keep it reasonable
207  return (n_recv <= MAXPREPOSTS);
208 }
209 
210 inline int get_max_buf_size(const DistributionPlan& plan){
211  int max_buf_size = 1; // Minimum size 1 (0 probably ok)
212  for(int i=0;i<plan.cnts.size();i++){
213  if(i!=plan.my_rank){ // Skip my_rank because those vertices are not being sent anywhere
214  max_buf_size = std::max(max_buf_size, plan.cnts(i));
215  }
216  }
217  return max_buf_size;
218 }
219 
220 inline int get_sum_counts(const DistributionPlan& plan){
221  return sum_view(plan.cnts);
222 }
223 
224 template<class Device>
225 struct VertexBuffer{
226  View<double**,CLayout,Device> view;
227  std::vector<MPI_Request> rrequests;
228  View<bool*,CLayout,HostType> rreq_assigned;
229 
230  std::vector<View<double**,CLayout,Device>> send_buffers;
231  std::vector<MPI_Request> srequests;
232  View<bool*,CLayout,HostType> sreq_assigned;
233 
234  VertexBuffer(){}
235 
236  VertexBuffer(int nnode_in, int n_dbl_per_vertex, int n_ranks)
237  : view(NoInit("tmp_redistribute"), nnode_in, n_dbl_per_vertex),
238  rrequests(n_ranks),
239  rreq_assigned(NoInit("rreq_assigned"), n_ranks),
240  send_buffers(n_ranks),
241  srequests(n_ranks),
242  sreq_assigned(NoInit("sreq_assigned"), n_ranks)
243  {
244  Kokkos::deep_copy(rreq_assigned, false);
245  Kokkos::deep_copy(sreq_assigned, false);
246  }
247 
248  // Unload the entire buffer
249  template<class... Vs>
250  void unload(int vertex_offset, const Vs&... arrays) const{
251  if(nnode()<=0) return;
252 
253  int arr_offset = 0;
254  unload_arrays(nnode(), vertex_offset, view, arr_offset, arrays...);
255  }
256 
257  int nnode() const{
258  return view.extent(0);
259  }
260 
261  void await_recvs(){
262  for(int i=0; i<rrequests.size(); i++){
263  if(rreq_assigned(i)){
264  MPI_Wait(&(rrequests[i]), MPI_STATUS_IGNORE);
265  }
266  }
267  }
268 
269  void await_sends(){
270  for(int i=0; i<srequests.size(); i++){
271  if(sreq_assigned(i)){
272  MPI_Wait(&(srequests[i]), MPI_STATUS_IGNORE);
273 
274  // Can release send buffer now that sending is confirmed complete
275  send_buffers[i] = View<double**,CLayout,Device>();
276  }
277  }
278  }
279 };
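// Typical lifecycle (see transfer_data below): the buffer is constructed with the new vertex count,
// receives are posted directly into 'view', outgoing data is packed and sent, and the caller then
// (in the asynchronous path) calls await_recvs(), unload()s into the resized destination arrays,
// and calls await_sends() to release the send buffers.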
280 
281 template<class... Vs>
282 inline VertexBuffer<HostType> transfer_data(const DistributionPlan& send_plan, const DistributionPlan& recv_plan, const MyMPI& mpi, bool async, const Vs&... arrays){
283  GPTLstart("TRANSFER_DATA_SETUP");
284  int n_dbl_per_vertex = total_n_doubles_per_vertex(arrays...);
285  int new_nnodes = get_sum_counts(recv_plan);
286  MPI_Comm comm = mpi.plane_comm;
287  int my_rank = mpi.my_plane_rank;
288  int n_ranks = mpi.n_plane_ranks;
289  GPTLstop("TRANSFER_DATA_SETUP");
290 
291  GPTLstart("TRANSFER_DATA_BUFFER_SETUP");
292  VertexBuffer<HostType> vertex_buffer(new_nnodes, n_dbl_per_vertex, n_ranks);
293  GPTLstop("TRANSFER_DATA_BUFFER_SETUP");
294 
295  constexpr int MSG_TAG = 5; // Arbitrary
296  constexpr int SIGNAL_TAG = 10; // Arbitrary
297 
298  // Count number of recvs. If not too many, prepost all receive requests (using XOR ordering and flow control)
299  GPTLstart("TRANSFER_DATA_PREPOST_SETUP");
300  const bool prepost_receive_requests = do_prepost_receive_requests(recv_plan);
301  GPTLstop("TRANSFER_DATA_PREPOST_SETUP");
302 
303  GPTLstart("TRANSFER_DATA_PREPOST_IRECV");
304  int p=1; while ( p < n_ranks ) p*=2; // smallest power of 2 >= n_ranks
305  if(prepost_receive_requests){
306  for(int i=1;i<p;i++){
307  int pid = pair(n_ranks,i,my_rank);
308  if (pid >= 0){
309  if (recv_plan.cnts(pid) > 0){
310  double* recv_addr = &vertex_buffer.view(recv_plan.displs(pid),0);
311  MPI_Irecv(recv_addr, recv_plan.cnts(pid)*n_dbl_per_vertex, MPI_DOUBLE, pid, MSG_TAG, comm, &(vertex_buffer.rrequests[pid]));
312  vertex_buffer.rreq_assigned(pid) = true;
313  if(!async){
314  int rsignal;
315  MPI_Send(&rsignal, 1, MPI_INTEGER, pid, SIGNAL_TAG, comm);
316  }
317  }
318  }
319  }
320  }
321  GPTLstop("TRANSFER_DATA_PREPOST_IRECV");
322 
323  GPTLstart("TRANSFER_DATA_LOAD_STAY_BUFFER");
324  // Copy what is staying on this rank
325  load_buffer(send_plan.cnts(my_rank), recv_plan.displs(my_rank), send_plan.displs(my_rank), vertex_buffer.view, arrays...);
326  GPTLstop("TRANSFER_DATA_LOAD_STAY_BUFFER");
327 
328  // The on-rank copy above is written directly into the receive buffer;
329  // exchange the rest with other ranks using XOR ordering and flow control
330  GPTLstart("TRANSFER_DATA_ALLOC_SENDARR_SH");
331  View<double**,CLayout,HostType> sendarr_shared;
332  if(!async){
333  int n_sendarr = get_max_buf_size(send_plan);
334  sendarr_shared = View<double**,CLayout,HostType>(NoInit("sendarr_shared"), n_sendarr, n_dbl_per_vertex);
335  }
336  GPTLstop("TRANSFER_DATA_ALLOC_SENDARR_SH");
337 
338  for(int i=1;i<p;i++){
339  int pid = pair(n_ranks,i,my_rank);
340  if (pid >= 0){
341  if(!prepost_receive_requests){
342  GPTLstart("TRANSFER_DATA_IRECV");
343  if(recv_plan.cnts(pid) > 0){
344  // Post receive request
345  double* recv_addr = &vertex_buffer.view(recv_plan.displs(pid),0);
346  MPI_Irecv(recv_addr, recv_plan.cnts(pid)*n_dbl_per_vertex, MPI_DOUBLE, pid, MSG_TAG, comm, &(vertex_buffer.rrequests[pid]));
347  vertex_buffer.rreq_assigned(pid) = true;
348  if(!async){
349  int rsignal;
350  MPI_Send(&rsignal, 1, MPI_INTEGER, pid, SIGNAL_TAG, comm);
351  }
352  }
353  GPTLstop("TRANSFER_DATA_IRECV");
354  }
355 
356  if (send_plan.cnts(pid) > 0){
357  View<double**,CLayout,HostType> sendarr;
358  if(async){
359  GPTLstart("TRANSFER_DATA_ALLOC_SENDARR");
360  // Hang onto allocations if not using a blocking send
361  vertex_buffer.send_buffers[pid] = View<double**,CLayout,HostType>(NoInit("sendarr"), send_plan.cnts(pid), n_dbl_per_vertex);
362  sendarr = vertex_buffer.send_buffers[pid];
363  GPTLstop("TRANSFER_DATA_ALLOC_SENDARR");
364  }else{
365  GPTLstart("TRANSFER_DATA_SENDARR_SUB");
366  // Use the shared buffer since the data is sent before its next use
367  sendarr = Kokkos::subview(sendarr_shared, Kokkos::make_pair(0,send_plan.cnts(pid)), Kokkos::ALL());
368  GPTLstop("TRANSFER_DATA_SENDARR_SUB");
369  }
370 
371  GPTLstart("TRANSFER_DATA_LOAD_SEND_BUFFER");
372  // Fill send buffer
373  load_buffer(send_plan.cnts(pid), 0, send_plan.displs(pid), sendarr, arrays...);
374  GPTLstop("TRANSFER_DATA_LOAD_SEND_BUFFER");
375 
376  if(!async){
377  // Wait for signal, and then send
378  int ssignal;
379  MPI_Recv(&ssignal, 1, MPI_INTEGER, pid, SIGNAL_TAG, comm, MPI_STATUS_IGNORE);
380  }
381  GPTLstart("TRANSFER_DATA_SEND");
382  if(async){
383  MPI_Isend(sendarr.data(), sendarr.size(), MPI_DOUBLE, pid, MSG_TAG, comm, &(vertex_buffer.srequests[pid]));
384  vertex_buffer.sreq_assigned(pid) = true;
385  }else{
386  MPI_Rsend(sendarr.data(), sendarr.size(), MPI_DOUBLE, pid, MSG_TAG, comm);
387  }
388  GPTLstop("TRANSFER_DATA_SEND");
389  }
390 
391  if(!async){
392  // wait for messages here
393  if (recv_plan.cnts(pid) > 0){
394  // Note: since using XOR ordering and flow control, no reason to delay the wait
395  MPI_Wait(&(vertex_buffer.rrequests[pid]), MPI_STATUS_IGNORE);
396  }
397  }
398  }
399  }
400 
401  return vertex_buffer;
402 }
403 
404 #endif
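Below is a minimal usage sketch of the interface above. The plan objects, communicator wrapper, and array names (send_plan, recv_plan, mpi, density, f0) are hypothetical stand-ins for objects set up elsewhere in XGCa; only the calls shown are taken from this header.

// Hypothetical sketch: redistribute two per-vertex arrays to a new poloidal decomposition.
// Assumes send_plan/recv_plan (DistributionPlan) and mpi (MyMPI) already describe the transfer,
// and that density/f0 hold the data of the old decomposition.
View<double*,CLayout,HostType> density;   // 1 double per vertex
VGridDistribution<HostType> f0;           // n_species*n_vr*n_vz doubles per vertex

// Pack outgoing data, post sends/receives, and copy the on-rank portion
VertexBuffer<HostType> buf = transfer_data(send_plan, recv_plan, mpi, /*async=*/true, density, f0);

// Resize the destination arrays to the new local vertex count, then unpack the received buffer
resize_n_vertices(buf.nnode(), density, f0);
buf.await_recvs();
buf.unload(0, density, f0);
buf.await_sends();  // send buffers can be released after this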
KOKKOS_INLINE_FUNCTION int n_species() const
Definition: vgrid_distribution.hpp:187
void resize_n_vertices(int new_n_nodes)
Definition: vgrid_distribution.hpp:241
KOKKOS_INLINE_FUNCTION double & pull_node_index(int inode, int ip) const
Definition: vgrid_distribution.hpp:232
KOKKOS_INLINE_FUNCTION int n_vr() const
Definition: vgrid_distribution.hpp:191
KOKKOS_INLINE_FUNCTION int n_vz() const
Definition: vgrid_distribution.hpp:199
void parallel_for(const std::string name, int n_ptl, Function func, Option option, HostAoSoA aosoa_h, DeviceAoSoA aosoa_d)
Definition: streamed_parallel_for.hpp:252
Kokkos::ViewAllocateWithoutInitializing NoInit
Definition: space_settings.hpp:69
Definition: domain_decomposition.hpp:15
View< int *, CLayout, HostType > displs
Definition: domain_decomposition.hpp:17
View< int *, CLayout, HostType > cnts
Definition: domain_decomposition.hpp:16
int my_rank
Definition: domain_decomposition.hpp:18
Definition: my_mpi.hpp:19
int n_plane_ranks
Definition: my_mpi.hpp:40
int my_plane_rank
Definition: my_mpi.hpp:39
MPI_Comm plane_comm
Definition: my_mpi.hpp:38
Definition: transfer_vertex_data.hpp:225
VertexBuffer()
Definition: transfer_vertex_data.hpp:234
void unload(int vertex_offset, const Vs &... arrays) const
Definition: transfer_vertex_data.hpp:250
std::vector< View< double **, CLayout, Device > > send_buffers
Definition: transfer_vertex_data.hpp:230
VertexBuffer(int nnode_in, int n_dbl_per_vertex, int n_ranks)
Definition: transfer_vertex_data.hpp:236
std::vector< MPI_Request > srequests
Definition: transfer_vertex_data.hpp:231
void await_sends()
Definition: transfer_vertex_data.hpp:269
View< bool *, CLayout, HostType > sreq_assigned
Definition: transfer_vertex_data.hpp:232
View< bool *, CLayout, HostType > rreq_assigned
Definition: transfer_vertex_data.hpp:228
View< double **, CLayout, Device > view
Definition: transfer_vertex_data.hpp:226
int nnode() const
Definition: transfer_vertex_data.hpp:257
std::vector< MPI_Request > rrequests
Definition: transfer_vertex_data.hpp:227
void await_recvs()
Definition: transfer_vertex_data.hpp:261
static int GPTLstart(const char *name)
Definition: timer_macro.hpp:9
static int GPTLstop(const char *name)
Definition: timer_macro.hpp:10
void load_buffer(int n, int new_offset, int old_offset, const View< double **, CLayout, HostType > &buffer, const Vs &... arrays)
Definition: transfer_vertex_data.hpp:152
int total_n_doubles_per_vertex(const V &first)
Definition: transfer_vertex_data.hpp:43
void load_arrays(int n, int new_offset, int old_offset, const View< double **, CLayout, HostType > &buffer, int &arr_offset, const V &array)
Definition: transfer_vertex_data.hpp:114
int get_sum_counts(const DistributionPlan &plan)
Definition: transfer_vertex_data.hpp:220
bool do_prepost_receive_requests(const DistributionPlan &plan)
Definition: transfer_vertex_data.hpp:199
void unload_arrays(int n, int vert_offset, const View< double **, CLayout, HostType > &buffer, int &arr_offset, const V &array)
Definition: transfer_vertex_data.hpp:165
int pair(int np, int p, int k)
Definition: transfer_vertex_data.hpp:6
KOKKOS_INLINE_FUNCTION double & vertex_access(const T &array, int i, int ip)
void resize_n_vertices(int nnodes, T &array)
VertexBuffer< HostType > transfer_data(const DistributionPlan &send_plan, const DistributionPlan &recv_plan, const MyMPI &mpi, bool async, const Vs &... arrays)
Definition: transfer_vertex_data.hpp:282
int n_doubles_per_vertex(const T &array)
int get_max_buf_size(const DistributionPlan &plan)
Definition: transfer_vertex_data.hpp:210
T::value_type sum_view(const T &view)
Definition: view_arithmetic.hpp:135