XGCa
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
streamed_parallel_for.hpp
Go to the documentation of this file.
1 #ifndef STREAMED_PARALLEL_FOR_HPP
2 #define STREAMED_PARALLEL_FOR_HPP
3 
4 namespace Streamed{
5 
// Identifies which GPU stream a pipeline role runs on, so that the
// host->device copy, the kernel, and the device->host copy of different
// chunks can overlap.
// NOTE(review): the enumerators Runner, Returner and NStreams were dropped
// by the documentation extraction (original lines 8-10); they are
// reconstructed here from their uses in parallel_for below.
enum StreamJob{
  Sender=0,   // Stream used for host->device copies
  Runner,     // Stream used to launch the compute kernel
  Returner,   // Stream used for device->host copies
  NStreams    // Number of streams (also sizes the dummy stream vector)
};
12 
// Options controlling which pipeline stages parallel_for executes
// (e.g. skip the host->device send when the device copy is already current).
// NOTE(review): the documentation extraction dropped original lines 15-16,
// which held further enumerators -- at least NoReturn, which parallel_for
// below compares against. Restore the missing enumerators from the original
// header before relying on this listing; their exact names/order cannot be
// recovered from what is visible here.
enum Option{
 NoSend=0,
};
18 
// The pipeline stages a chunk passes through, in order; each value indexes
// into the tasks vector in parallel_for.
// NOTE(review): all enumerators except Run were dropped by the documentation
// extraction (original lines 20-21, 23-25); they are reconstructed here from
// their uses in parallel_for below, with Run kept as the third entry to match
// its original source line (22).
enum Tasks{
  ToPinned=0, // Stage chunk from host view into pinned send buffer
  Send,       // Copy chunk from host (or pinned buffer) to device
  Run,        // Execute the kernel on the chunk
  Return,     // Copy chunk from device back to host (or pinned buffer)
  FromPinned, // Unstage chunk from pinned return buffer into host view
  NTasks      // Number of task kinds (sizes the tasks vector)
};
27 
// Define streams here; check that USE_STREAMS isn't defined earlier
#ifdef USE_STREAMS
# error Preprocessor flag USE_STREAMS is already in use
#endif

// Currently streams are only set up for Cuda/HIP (and only if OpenMP is available):
// GPU_STREAM(x) expands to "x," when streams exist, so a stream/execution-space
// argument can be prepended to calls such as Kokkos::deep_copy; otherwise it
// expands to nothing and the call falls back to the default space.
#if defined(USE_GPU) && (defined(USE_CUDA) || defined(USE_HIP))
# define USE_STREAMS
// typedef Kokkos::Cuda GPUStream;
# define GPU_STREAM(x) x,
#elif defined(USE_GPU)
 // If GPU in use, but not Cuda (or no OpenMP), so streams currently unavailable
 struct GPUStream { void fence(){Kokkos::fence();} }; // Call general fence
# define GPU_STREAM(x)
#else
 // CPU only
 struct GPUStream { void fence(){} }; // Fake fence if not using GPUs
# define GPU_STREAM(x)
#endif
47 
// Number of SoAs assigned to partition i_partition when n_soa_on_device SoAs
// are split across n_partitions_of_device_aosoa partitions of the device
// AoSoA. With streams, the SoAs are spread as evenly as possible (low-index
// partitions absorb the remainder, so partition 0 is always the largest).
// Without streams, partition 0 holds everything and the remaining partitions
// are empty place-holders that keep the task loop uniform.
inline int partition_size(int i_partition, int n_soa_on_device, int n_partitions_of_device_aosoa){
#ifdef USE_STREAMS
  const int n_parts = n_partitions_of_device_aosoa;
  // Even split with remainder pushed to the lowest-index partitions
  return (n_soa_on_device + n_parts - i_partition - 1)/n_parts;
#else
  // Entire device AoSoA is the first (and only non-empty) partition
  if(i_partition==0) return n_soa_on_device;
  return 0;
#endif
}
56 
61 template<typename T>
62 struct StreamView{
63 
65 
68 
69  T* pinned_send[2];
71 
82  template<typename H,typename D>
83  StreamView(H& view_h, D& view_d, bool stage_in_pinned_memory_in, int n_on_device, int n_partitions_of_device_view)
84  : h_view_ptr((T*)view_h.data()),
85  d_view_ptr((T*)view_d.data()),
86  stage_in_pinned_memory(stage_in_pinned_memory_in)
87  {
88 #ifdef USE_STREAMS
90  int i_largest_partition = 0;
91  int size_of_largest_partition = partition_size(i_largest_partition, n_on_device, n_partitions_of_device_view);
92 #ifdef USE_CUDA
93  cudaMallocHost((void**)&pinned_send[0], size_of_largest_partition*sizeof(T));
94  cudaMallocHost((void**)&pinned_send[1], size_of_largest_partition*sizeof(T));
95  cudaMallocHost((void**)&pinned_return[0], size_of_largest_partition*sizeof(T));
96  cudaMallocHost((void**)&pinned_return[1], size_of_largest_partition*sizeof(T));
97 #else
98  int ierr;
99  ierr = hipHostMalloc((void**)&pinned_send[0], size_of_largest_partition*sizeof(T));
100  ierr = hipHostMalloc((void**)&pinned_send[1], size_of_largest_partition*sizeof(T));
101  ierr = hipHostMalloc((void**)&pinned_return[0], size_of_largest_partition*sizeof(T));
102  ierr = hipHostMalloc((void**)&pinned_return[1], size_of_largest_partition*sizeof(T));
103 #endif
104  }
105 #endif
106  }
107 
114 #ifdef USE_STREAMS
116 #ifdef USE_CUDA
117  cudaFree(pinned_send[0]);
118  cudaFree(pinned_send[1]);
119  cudaFree(pinned_return[0]);
120  cudaFree(pinned_return[1]);
121 #else
122  int ierr;
123  ierr = hipFree(pinned_send[0]);
124  ierr = hipFree(pinned_send[1]);
125  ierr = hipFree(pinned_return[0]);
126  ierr = hipFree(pinned_return[1]);
127 #endif
128  }
129 #endif
130  }
131 
144  template<typename ST>
145  inline void copy_to_device(int offset_h, int offset_d, int n, int i_staged_area, ST& gpu_stream){
146  T* host_loc = (stage_in_pinned_memory ? pinned_send[i_staged_area] : h_view_ptr + offset_h);
147  Kokkos::View<T*, HostType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> view_h(host_loc,n);
148  Kokkos::View<T*, DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> view_d(d_view_ptr + offset_d,n);
149  Kokkos::deep_copy(GPU_STREAM(gpu_stream) view_d, view_h);
150  }
151 
164  template<typename ST>
165  inline void copy_to_host(int offset_h, int offset_d, int n, int i_staged_area, ST& gpu_stream){
166  T* host_loc = (stage_in_pinned_memory ? pinned_return[i_staged_area] : h_view_ptr + offset_h);
167  Kokkos::View<T*, HostType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> view_r_h(host_loc,n);
168  Kokkos::View<T*, DeviceType, Kokkos::MemoryTraits<Kokkos::Unmanaged>> view_r_d(d_view_ptr + offset_d,n);
169  Kokkos::deep_copy(GPU_STREAM(gpu_stream) view_r_h, view_r_d);
170  }
171 
181  inline void copy_to_pinned(int offset_h, int n, int i_staging_area){
182  //std::memcpy(pinned_send, h_view_ptr + offset_h, n*sizeof(T));
183  #pragma omp parallel for
184  for(int i_p = 0; i_p<n; i_p++){
185  pinned_send[i_staging_area][i_p] = h_view_ptr[offset_h+i_p];
186  }
187  }
188 
198  inline void copy_from_pinned(int offset_h, int n, int i_staging_area){
199  //std::memcpy(h_view_ptr + offset_h, pinned_return, n*sizeof(T));
200  #pragma omp parallel for
201  for(int i_p = 0; i_p<n; i_p++){
202  h_view_ptr[offset_h+i_p] = pinned_return[i_staging_area][i_p];
203  }
204  }
205 };
206 
/// Book-keeping for one pipeline stage: how many SoAs the stage handles this
/// iteration (n) and the running SoA offset it has reached so far (offset).
struct Task{
  int n{0};      ///< Number of SoAs to process in the current iteration
  int offset{0}; ///< SoA offset of the current chunk within the host AoSoA

  Task(){}

  /// Advance the offset past the chunk just processed.
  inline void advance(){ offset += n; }
};
222 
/// Streamed particle parallel_for: runs func over n_ptl particles stored in
/// the host AoSoA aosoa_h, pushing them through the (possibly smaller)
/// device AoSoA aosoa_d in chunks. With USE_STREAMS the pipeline stages of
/// consecutive chunks (stage-to-pinned, send, run, return, unstage) overlap
/// on separate streams; without streams the whole device AoSoA is treated as
/// a single chunk and the stages execute back to back.
/// @param name     Label passed to Kokkos::parallel_for (profiling/debugging)
/// @param n_ptl    Number of particles to operate on
/// @param func     Functor executed per particle index on the device
/// @param option   Controls whether the send and/or return copies are skipped
/// @param aosoa_h  Host AoSoA holding all particles
/// @param aosoa_d  Device AoSoA used as the staging/compute buffer
template<typename Function, typename HostAoSoA, typename DeviceAoSoA>
void parallel_for(const std::string name, int n_ptl, Function func, Option option, HostAoSoA aosoa_h, DeviceAoSoA aosoa_d){
  // Performance options
#ifdef USE_STREAMS
  const int desired_n_ptl_per_chunk = 2e6; // How many particles are needed to saturate the GPU
  const bool stage_in_pinned_memory = true;
#else
  const bool stage_in_pinned_memory = false; // Pinned staging only pays off with streams
#endif
  const bool verbose = false;

#ifdef USE_STREAMS
  // Initialize streams: one execution space instance per pipeline role
  // (Sender/Runner/Returner), weights 1,1,1
  auto gpu_streams = Kokkos::Experimental::partition_space(Kokkos::DefaultExecutionSpace(),1,1,1);
#else
  // If not using streams, create a dummy variable (the fake GPUStream above)
  std::vector<GPUStream> gpu_streams(NStreams);
#endif

  // Total number of SoAs that need to be run on
  int n_soa_total = aosoa_h.size()/VEC_LEN;
  if(n_soa_total*VEC_LEN != aosoa_h.size()) {printf("\nERROR: streamed_parallel_for assumes the last SoA in the AoSoA is full\n"); exit(1);}

  // Total amount of SoAs that can fit on the device
  int n_soa_on_device = aosoa_d.size()/VEC_LEN;
  if(n_soa_on_device*VEC_LEN != aosoa_d.size()) {printf("\nERROR: streamed_parallel_for assumes the last device SoA in the AoSoA is full\n"); exit(1);}

  if(n_soa_on_device==0 && n_soa_total!=0) {printf("\nERROR: streamed_parallel_for requires non-zero amount of device memory\n"); exit(1);}

#ifdef USE_STREAMS
  int desired_n_soa_per_chunk = desired_n_ptl_per_chunk/VEC_LEN;
  int n_partitions_of_device_aosoa = (n_soa_on_device+desired_n_soa_per_chunk-1)/desired_n_soa_per_chunk; // ceiling
  // At least 3 partitions so send/run/return can each own a different
  // partition of the device AoSoA at the same time
  n_partitions_of_device_aosoa = std::max(n_partitions_of_device_aosoa, 3);
#else
  // If not streaming, then there is no need to partition device AoSoA; use 3 partitions to simplify task loop
  int n_partitions_of_device_aosoa = 3;
#endif

  // Set up stream view to handle streaming of the aosoa to device and back
  StreamView<typename HostAoSoA::soa_type> stream_view(aosoa_h, aosoa_d, stage_in_pinned_memory, n_soa_on_device, n_partitions_of_device_aosoa);

  // Set up tasks: ordered_tasks lists the stages actually in use, in the
  // order a given chunk passes through them (optional stages are skipped
  // according to option and the staging flag)
  std::vector<Task> tasks(NTasks);
  std::vector<Tasks> ordered_tasks;
  if(option!=NoSend){
    if(stage_in_pinned_memory) ordered_tasks.push_back(ToPinned);
    ordered_tasks.push_back(Send);
  }
  ordered_tasks.push_back(Run);
  if(option!=NoReturn){
    ordered_tasks.push_back(Return);
    if(stage_in_pinned_memory) ordered_tasks.push_back(FromPinned);
  }

  int n_soa_remaining = n_soa_total;
  int i = 0;
  bool finished_all = false;
  //GPTLstart("stream_while_loop");
  // Software pipeline: on each iteration, each active stage operates on a
  // different chunk, and chunks advance one stage per iteration
  while(!finished_all){
    if (verbose) printf("\nStep %d", i);
    // Determine number of particles to send in next chunk
    int i_partition = i % n_partitions_of_device_aosoa;
    int p_size = partition_size(i_partition, n_soa_on_device, n_partitions_of_device_aosoa);
    int n_first_op = std::min(p_size, n_soa_remaining);
    n_soa_remaining -= n_first_op;

    // For next chunk, execute on the number of SoAs used by the previous operation
    // (shift chunk sizes one stage down the pipeline, back to front)
    for (int it=ordered_tasks.size()-1; it>0; it--){
      tasks[ordered_tasks[it]].n = tasks[ordered_tasks[it-1]].n;
    }
    tasks[ordered_tasks[0]].n = n_first_op;

    // For pinned memory, alternate staging areas
    // (i_staging_area is filled this step; i_staged_area was filled last step)
    int i_staging_area = i%2;
    int i_staged_area = (i+1)%2;

    // Copy chunk to device by recasting as View
    if(tasks[Send].n>0){
      // Device offset wraps around the circular device AoSoA
      int offset_send_d = tasks[Send].offset % n_soa_on_device;
      stream_view.copy_to_device(tasks[Send].offset, offset_send_d, tasks[Send].n, i_staged_area, gpu_streams[Sender]);
      if (verbose) printf("\n  Copy (%d - %d) on host to (%d - %d) on device", tasks[Send].offset, tasks[Send].offset+tasks[Send].n, offset_send_d, offset_send_d+tasks[Send].n);
      tasks[Send].advance();
    }

    // Launch parallel_for
    if(tasks[Run].n>0){
      int offset_run_d = tasks[Run].offset % n_soa_on_device;
      // Since this is a GPU-only feature, the parallel_for loops over particles, not vectors
      // Particles bounds are:
      int ptl_offset_d = offset_run_d*VEC_LEN;
      int ptl_last_d = ptl_offset_d + tasks[Run].n*VEC_LEN;

      // Stop at n_ptl even if size of AoSoA is larger
      // (translate the global particle count into the current device window)
      int n_ptl_d = n_ptl - (tasks[Run].offset-offset_run_d)*VEC_LEN;
      ptl_last_d = std::min(ptl_last_d, n_ptl_d);

      // Launch parallel_for
#ifdef USE_STREAMS
      Kokkos::parallel_for(name.c_str(), Kokkos::RangePolicy<ExSpace>(gpu_streams[Runner], ptl_offset_d, ptl_last_d), func);
#else
      Kokkos::parallel_for(name.c_str(), Kokkos::RangePolicy<ExSpace>(ptl_offset_d, ptl_last_d), func);
#endif
      if (verbose) printf("\n  Run (%d - %d) on device", offset_run_d, offset_run_d+tasks[Run].n);
      if (verbose) printf("\n  i.e. local ptl (%d - %d), global ptl (%d - %d)", ptl_offset_d, ptl_last_d, tasks[Run].offset*VEC_LEN, tasks[Run].offset*VEC_LEN+(ptl_last_d-ptl_offset_d));
      tasks[Run].advance();
    }

    // Copy finished chunk back by recasting as View
    if(tasks[Return].n>0){
      int offset_return_d = tasks[Return].offset % n_soa_on_device;
      stream_view.copy_to_host(tasks[Return].offset, offset_return_d, tasks[Return].n, i_staged_area, gpu_streams[Returner]);
      if (verbose) printf("\n  Copy to (%d - %d) on host from (%d - %d) on device", tasks[Return].offset, tasks[Return].offset+tasks[Return].n, offset_return_d, offset_return_d+tasks[Return].n);
      tasks[Return].advance();
    }

    // Staging in pinned
    // Put staging at end since it is on CPU and thus would block GPU launches if it were placed before them
    if(tasks[ToPinned].n>0){
      stream_view.copy_to_pinned(tasks[ToPinned].offset, tasks[ToPinned].n, i_staging_area);
      if (verbose) printf("\n  Copy (%d - %d) on host to pinned_send", tasks[ToPinned].offset, tasks[ToPinned].offset+tasks[ToPinned].n);
      tasks[ToPinned].advance();
    }

    if(tasks[FromPinned].n>0){
      stream_view.copy_from_pinned(tasks[FromPinned].offset, tasks[FromPinned].n, i_staging_area);
      if (verbose) printf("\n  Copy (%d - %d) to host from pinned_return", tasks[FromPinned].offset, tasks[FromPinned].offset+tasks[FromPinned].n);
      tasks[FromPinned].advance();
    }

    // All processes must be complete before going to the next chunk
    Kokkos::fence();

    // Advance to next partition
    i++;

    // Exit condition: all operations are completed
    // (every stage had an empty chunk this iteration, i.e. the pipeline drained)
    finished_all = true;
    for (int it = 0; it<NTasks; it++){ finished_all = finished_all && (tasks[it].n==0); }
  }
  //GPTLstop("stream_while_loop");
  if (verbose) printf("\nComplete in %d steps", i);
}
393 
394 } // Namespace
395 #endif
Definition: streamed_parallel_for.hpp:7
void copy_to_device(int offset_h, int offset_d, int n, int i_staged_area, ST &gpu_stream)
Definition: streamed_parallel_for.hpp:145
Definition: streamed_parallel_for.hpp:8
T * pinned_send[2]
Definition: streamed_parallel_for.hpp:69
void advance()
Definition: streamed_parallel_for.hpp:218
~StreamView()
Definition: streamed_parallel_for.hpp:113
Definition: streamed_parallel_for.hpp:207
Definition: streamed_parallel_for.hpp:9
Definition: streamed_parallel_for.hpp:62
Definition: streamed_parallel_for.hpp:22
bool stage_in_pinned_memory
Definition: streamed_parallel_for.hpp:64
Tasks
Definition: streamed_parallel_for.hpp:19
Definition: streamed_parallel_for.hpp:16
Definition: streamed_parallel_for.hpp:14
int n
Definition: streamed_parallel_for.hpp:208
Definition: streamed_parallel_for.hpp:10
Definition: streamed_parallel_for.hpp:25
void copy_from_pinned(int offset_h, int n, int i_staging_area)
Definition: streamed_parallel_for.hpp:198
Definition: streamed_parallel_for.hpp:20
Definition: streamed_parallel_for.hpp:21
T * h_view_ptr
Definition: streamed_parallel_for.hpp:66
Option
Definition: streamed_parallel_for.hpp:13
StreamView(H &view_h, D &view_d, bool stage_in_pinned_memory_in, int n_on_device, int n_partitions_of_device_view)
Definition: streamed_parallel_for.hpp:83
void copy_to_host(int offset_h, int offset_d, int n, int i_staged_area, ST &gpu_stream)
Definition: streamed_parallel_for.hpp:165
StreamJob
Definition: streamed_parallel_for.hpp:6
#define GPU_STREAM(x)
Definition: streamed_parallel_for.hpp:37
T * pinned_return[2]
Definition: streamed_parallel_for.hpp:70
Definition: streamed_parallel_for.hpp:24
int offset
Definition: streamed_parallel_for.hpp:209
void copy_to_pinned(int offset_h, int n, int i_staging_area)
Definition: streamed_parallel_for.hpp:181
Definition: streamed_parallel_for.hpp:15
Definition: streamed_parallel_for.hpp:23
int partition_size(int i_partition, int n_soa_on_device, int n_partitions_of_device_aosoa)
Definition: streamed_parallel_for.hpp:48
void parallel_for(const std::string name, int n_ptl, Function func, Option option, HostAoSoA aosoa_h, DeviceAoSoA aosoa_d)
Definition: streamed_parallel_for.hpp:252
T * d_view_ptr
Definition: streamed_parallel_for.hpp:67
Task()
Definition: streamed_parallel_for.hpp:211