xgc1/html/load__balance_8hpp_source.html

 #ifndef LOAD_BALANCE_HPP

 #define LOAD_BALANCE_HPP


 #include "timer_macro.hpp"

 #include "shift.hpp"

 #include "count_ptl_per_node.hpp"

 #include "view_arithmetic.hpp"

 #include "f0_redistribute.hpp"


 // Used for original fortran load balancer

 // These routines are declared with C linkage and are not defined in this header.

 // Called with the f0 cost before assess_whether_to_rebalance_load().

 extern "C" void calculate_load_imbalance(double f0_cost);

 // Callers in this file treat a return value of 1 as "rebalance".

 extern "C" int assess_whether_to_rebalance_load();

 // Called after the assessment to reset the fortran cost trackers.

 extern "C" void reset_cost_trackers();

 // Receives the decomposition array, the per-node particle counts and the per-node f0 cost as raw pointers.

 extern "C" void set_weights(int* gvid0_pid, double* ptl_count, double* f0_node_cost);


 /* Timing model for one region of the code that is subject to load balancing.
  *
  * The region is described by a set of GPTL timers. Each call to update_model() measures the
  * time accumulated in those timers since the previous call, collects the timings of all ranks
  * onto rank 0 of period_comm, and updates an estimate of the time spent per vertex. The estimate
  * can then be used to predict the slowest rank of a proposed partition.
  *
  * Partitions are 1-indexed arrays with one more entry than there are ranks:
  * rank i owns vertices partition(i)-1 up to (but excluding) partition(i+1)-1 (0-indexed). */
 class LoadRegion{
     public:
     // How new timings are folded into the per-vertex estimate
     enum class UpdateMethod{
         NoHistory=0, // Overwrite the estimate with the latest timings
         ExpHistory   // Move the estimate part of the way toward the latest timings
     };

     private:

     // Members carry default initializers so that every getter returns a defined value
     // even before the first call to update_model()
     View<double*,HostType> estimated_time_per_vertex; // Modeled time per vertex (0-indexed)
     std::string region_name;
     std::vector<std::string> timer_names; // GPTL timers whose sum defines the region time
     bool verbose = false;
     bool model_is_initialized = false;
     bool model_has_history = false;
     double predicted_max_region_time = 0.0;
     double observed_max_region_time = 0.0;
     double observed_load_imbalance = 0.0;
     double prediction_undershoot = 1.0;

 #ifdef USE_MPI
     MPI_Comm inter_period_comm;
     MPI_Comm period_comm;
 #endif
     int n_periods = 1;
     int n_unique_ranks = 1;
     int my_period_rank = 0;

     double time_accumulated = 0.0; // Timer total seen at the previous measurement

     UpdateMethod update_method = UpdateMethod::NoHistory;

     /* Overwrites the estimate: every vertex owned by a rank is assigned that rank's
      * timing divided by the number of vertices it owns. */
     void update_model_no_history(const View<int*,CLayout,HostType>& current_partition, const View<double*, HostType>& all_periods_timings){
         const int n_timings = static_cast<int>(all_periods_timings.size());

         // Loop over ranks' timers
         for(int i_rank=0; i_rank<n_timings; i_rank++){
             const int node_offset = current_partition(i_rank) - 1;
             const int next_node_offset = current_partition(i_rank+1) - 1;
             const int nnodes = next_node_offset - node_offset;

             // A rank without vertices has nothing to update (and would divide by zero)
             if(nnodes<=0) continue;

             // Best 0th order estimate is to assume all vertices took the same time
             const double time_per_node = all_periods_timings(i_rank)/nnodes;
             for(int j=0; j<nnodes; j++){
                 estimated_time_per_vertex(j + node_offset) = time_per_node;
             }
         }
     }

     /* Adjusts the estimate: for each rank, the difference between the observed and the modeled
      * average time per vertex is scaled by adjustment_rate and added to each of its vertices. */
     void update_model_exp_history(const View<int*,CLayout,HostType>& current_partition, const View<double*, HostType>& all_periods_timings){
         constexpr double adjustment_rate = 0.5;
         const int n_timings = static_cast<int>(all_periods_timings.size());

         // Loop over ranks' timers
         for(int i_rank=0; i_rank<n_timings; i_rank++){
             // Get partition owned by this rank
             const int node_offset = current_partition(i_rank) - 1;
             const int next_node_offset = current_partition(i_rank+1) - 1;
             const int nnodes = next_node_offset - node_offset;

             // A rank without vertices has nothing to update (and would divide by zero)
             if(nnodes<=0) continue;

             // Get average time per node for this rank
             const double avg_time_per_node = all_periods_timings(i_rank)/nnodes;

             // Get expected time based on existing model
             double expected_time = 0.0;
             for(int j=0; j<nnodes; j++){
                 expected_time += estimated_time_per_vertex(j + node_offset);
             }
             const double expected_time_per_node = expected_time/nnodes;

             // Difference between observed time and expected time
             const double extra_time_per_node = avg_time_per_node - expected_time_per_node;

             // Distribute extra time evenly
             for(int j=0; j<nnodes; j++){
                 estimated_time_per_vertex(j + node_offset) += extra_time_per_node*adjustment_rate;
             }
         }
     }

     /* Returns the time accumulated by this region's timers since the previous call,
      * and stores the new accumulated total for the next call. */
     double get_time_since_previous_call(){
         // Get time from camtimers (accumulated)
         double new_accumulated_time = 0.0;

         // Loop through all timers associated with this region, and add the accumulated time for each
         for(const std::string& timer_name : timer_names){
             double accumulated_single_timer = 0;
             const int ierr = GPTLget_wallclock(timer_name.c_str(), -1, &accumulated_single_timer);

             // If ierr != 0, timer is not yet defined, so the accumulated time is zero
             if(ierr != 0) accumulated_single_timer = 0.0;

             new_accumulated_time += accumulated_single_timer;
         }

         // Subtract previous accumulated time to get time spent since last update
         const double time_spent = new_accumulated_time - time_accumulated;

         // Save new accumulated time
         time_accumulated = new_accumulated_time;

         return time_spent;
     }

     // Starts and immediately stops each timer of the region
     void touch_timers(){
         for(const std::string& timer_name : timer_names){
             GPTLstart(timer_name.c_str());
             GPTLstop(timer_name.c_str());
         }
     }

     public:

 #ifdef USE_MPI
     /* n_vertices:        length of the per-vertex estimate
      * inter_period_comm: communicator over which each rank's timing is max-reduced onto its rank 0
      * period_comm:       communicator over which the reduced timings are gathered onto its rank 0
      * update_method:     how new timings are folded into the model
      * verbose:           print diagnostics from update_model()
      * region_name:       name of the region
      * timer_names:       GPTL timers whose accumulated times are summed for this region
      *
      * The initializer list follows the declaration order of the members, which is the order
      * in which they are actually initialized. */
     LoadRegion(int n_vertices, const MPI_Comm& inter_period_comm, const MPI_Comm& period_comm, UpdateMethod update_method, bool verbose, std::string region_name, std::vector<std::string> timer_names)
         : estimated_time_per_vertex("estimated_time_per_vertex",n_vertices),
           region_name(region_name),
           timer_names(timer_names),
           verbose(verbose),
           model_is_initialized(false),
           model_has_history(false),
           prediction_undershoot(1.0),
           inter_period_comm(inter_period_comm),
           period_comm(period_comm),
           time_accumulated(0.0),
           update_method(update_method)
     {
         // Touch GPTL timers to be sure they exist
         touch_timers();

         // Initialize timer
         reset_timer();

         // Get problem size from comms
         MPI_Comm_rank(period_comm, &my_period_rank);
         MPI_Comm_size(period_comm, &n_unique_ranks);
         MPI_Comm_size(inter_period_comm, &n_periods);
     }
 #endif

     // Restarts the measurement interval: the time measured so far is discarded
     void reset_timer(){
         (void)get_time_since_previous_call();
     }

     // Modeled time of vertex i (0-indexed)
     double get_estimated_time_per_vertex(int i) const{
         return estimated_time_per_vertex(i);
     }

     // The whole per-vertex estimate
     View<double*,HostType> get_estimated_time_per_vertex() const{
         return estimated_time_per_vertex;
     }

     // Ratio observed/predicted max time from the last update, never below 1
     double get_prediction_undershoot() const{
         return prediction_undershoot;
     }

     // Largest per-rank timing seen in the last update (0 before the first update)
     double get_observed_max_region_time() const{
         return observed_max_region_time;
     }

     // Idle time divided by working time in the last update (0 before the first update)
     double get_observed_load_imbalance() const{
         return observed_load_imbalance;
     }

     bool get_model_is_initialized() const{
         return model_is_initialized;
     }

     // Predict performance of new partition based on model:
     // returns the largest per-rank sum of the per-vertex estimates
     double get_largest_predicted_time(const View<int*,CLayout,HostType>& proposed_partition) const{
         // Computed in signed arithmetic so that an empty partition yields zero ranks
         const int n_ranks = static_cast<int>(proposed_partition.size()) - 1;

         double largest_time = 0.0;
         for(int i_rank=0; i_rank<n_ranks; i_rank++){
             double proc_time = 0.0;
             for(int i_node=proposed_partition(i_rank)-1; i_node<proposed_partition(i_rank+1)-1; i_node++){
                 proc_time += estimated_time_per_vertex(i_node);
             }
             if(proc_time>largest_time){
                 largest_time = proc_time;
             }
         }
         return largest_time;
     }

     // Starts a fresh measurement interval and marks the model as usable
     void initialize_model(){
         reset_timer();

         // Model can now be used
         model_is_initialized = true;
     }

     /* Measures the region time since the previous call (or uses manual_time if it is not negative),
      * collects the timings, and updates the model on the rank for which is_rank_zero() holds.
      * NOTE(review): without USE_MPI the timing array is never filled; confirm whether non-MPI
      * builds are expected to call this. */
     void update_model(const View<int*,CLayout,HostType>& current_partition, double manual_time=-1.0){
         if(update_method==UpdateMethod::NoHistory || update_method==UpdateMethod::ExpHistory){

             // Allocate for timing of each
             View<double*, HostType> all_periods_timings(NoInit("all_periods_timings"), n_unique_ranks);

 #ifdef USE_MPI
             // Get time spent since last model update
             // Time can be entered manually; it's used if positive (i.e. the input was provided)
             double time_spent_this_rank = (manual_time<0.0 ? get_time_since_previous_call()
                                                            : manual_time);

             // Reduce onto plane 0
             // Could try MPI_SUM rather than MPI_MAX
             // The receive buffer is only written on the root of the reduction, so it starts out as
             // this rank's own time; that way the gather below never sends an uninitialized value
             double time_spent = time_spent_this_rank;
             if(n_periods>1){
                 MPI_Reduce(&time_spent_this_rank, &time_spent, 1, MPI_DOUBLE, MPI_MAX, 0, inter_period_comm);
             }

             // Gather from all ranks in plane 0
             MPI_Gather(&time_spent, 1, MPI_DOUBLE, all_periods_timings.data(), 1, MPI_DOUBLE, 0, period_comm);
 #endif

             if (is_rank_zero()){
                 // Look at the timing prediction made last time
                 predicted_max_region_time = get_largest_predicted_time(current_partition);
                 if(verbose) printf("\nPredicted max time in this region was %1.5e", predicted_max_region_time);

                 // Get max region time
                 const int n_timings = static_cast<int>(all_periods_timings.size());
                 observed_max_region_time = 0.0;
                 double observed_sum_region_time = 0.0;
                 for(int i_rank=0; i_rank<n_timings; i_rank++){
                     observed_max_region_time = std::max(observed_max_region_time, all_periods_timings(i_rank));
                     observed_sum_region_time += all_periods_timings(i_rank);
                 }
                 const double observed_idle_region_time = observed_max_region_time*n_timings - observed_sum_region_time;

                 // With no recorded work there is no imbalance to report (and the ratio would be 0/0)
                 observed_load_imbalance = (observed_sum_region_time>0.0 ? observed_idle_region_time/observed_sum_region_time
                                                                         : 0.0);
                 if(verbose) printf("\nObserved max time in this region was %1.5e", observed_max_region_time);
                 if(verbose) printf("\n - Load imbalance (T_idle/T_work): %1.2f%%", observed_load_imbalance*100);

                 // Take the max here so that there is never an assumed overshoot
                 if(predicted_max_region_time!=0.0){ // If time is zero, there hasn't been a prediction yet
                     prediction_undershoot = std::max(observed_max_region_time/predicted_max_region_time, 1.0);
                 }
                 if(verbose) printf("\nThe prediction undershot by a factor of %1.5e", observed_max_region_time/predicted_max_region_time);

                 if(update_method==UpdateMethod::NoHistory){
                     update_model_no_history(current_partition, all_periods_timings);
                 }else if(update_method==UpdateMethod::ExpHistory){
                     if(model_has_history){
                         update_model_exp_history(current_partition, all_periods_timings);
                     }else{
                         // The first update has nothing to blend with, so it sets the model directly
                         update_model_no_history(current_partition, all_periods_timings);
                         model_has_history = true;
                     }
                 }
             }
         }else{
             exit_XGC("Error: Update method not available\n");
         }
     }
 };


 class LoadBalance{

     public:

     // Selects how a new partition is computed

     enum class WeightingAlgorithm{

         // Weights are the estimated time per vertex of the single load region

         SingleRegionBalance=0,

         // Weights are the particle counts

         ParticleBalance,

         // The partition is computed by the fortran routine set_weights

         Fortran,

         // Placeholder that rebalance() replaces with the algorithm chosen at construction

         Default

     };


     // Quantity that limits how much can be assigned to one rank

     enum class ConstraintOption{

         // Limit on the particle count; the only option handled by the constructor

         ParticleCount=0

     };


     // Selects whether a rebalance is unconditional

     enum class ReweightOption{

         // will_rebalance() returns true without any assessment

         Always=0,

         // will_rebalance() assesses whether a rebalance is worthwhile

         IfBetter

     };


     private:


     // Algorithm selected from the input file; used when rebalance() is called with Default

     WeightingAlgorithm default_weighting_algorithm;


     // Timing models; the constructor adds one region only for SingleRegionBalance

     std::vector<LoadRegion> regions;


     // Candidate partition, 1-indexed, with one more entry than there are ranks

     View<int*,CLayout,HostType> proposed_partition;


 #ifdef USE_MPI

     // Communicator used to broadcast the proposal and the rebalance decision from rank 0

     MPI_Comm comm;

 #endif


     // Fractional improvement a proposed partition must exceed to be recommended

     double threshold_to_rebalance;

     bool verbose;


     // Upper limit on the summed constraint (particle count) of a single rank

     double constraint1_max;


     double get_even_division(const View<double*,HostType>& input, int n) const{

         double total = 0.0;

         Kokkos::parallel_reduce("sum", Kokkos::RangePolicy<HostExSpace>(0,input.size()), [=]( const int i, double& l_total){

             l_total += input(i);

         }, total);


         return (total/n);

     }


     /* Fills proposed_partition by walking through the nodes in order and closing off a rank
      * as soon as one of the criteria below marks it as sufficiently loaded.
      *
      * weight:                 per-node work estimate
      * constraint1:            per-node value of the constrained quantity
      * target_weight_per_rank: weight at which a rank is considered loaded
      *
      * Returns true if the summed constraint of the final rank stays below constraint1_max,
      * false otherwise. */
     bool greedily_fill_partition(const View<double*,HostType>& weight, const View<double*,HostType>& constraint1, double target_weight_per_rank){
         int nnode = weight.size();
         int nproc = proposed_partition.size()-1;

         // Start from rank 0 of the plane; assign nodes until it meets the desired weight per rank, then move to next rank
         double constraint1_in_this_rank = 0.0;
         double weight_in_this_rank = 0.0;
         proposed_partition(0) = 1; // 1-indexed
         int pid = 0;
         bool assign_one_node_per_proc = false;
         for(int i_node = 0; i_node<nnode; i_node++){
             // Since every rank needs at least one node, switch to assigning at every
             // iteration if there are only as many nodes left as ranks
             if(nnode-i_node==nproc-pid) assign_one_node_per_proc = true;

             // Check if a criterion is met to consider the rank sufficiently loaded
             bool rank_is_loaded = false;

             // Criterion 1: We are out of nodes
             if(assign_one_node_per_proc) rank_is_loaded = true;

             // Criterion 2: We have hit a constraint
             if(constraint1_in_this_rank>=constraint1_max) rank_is_loaded = true;

             // Criterion 3: We have loaded the rank with an even amount of work
             if(weight_in_this_rank>=target_weight_per_rank) rank_is_loaded = true;

             // If there are enough particles assigned to this rank, move to the next rank
             if(rank_is_loaded){
                 proposed_partition(pid+1) = i_node+1; // 1-indexed
                 if(verbose) printf("\nRank %d is loaded (Nodes %d-%d); weight=%1.4e, ptl*nplanes=%d", pid, proposed_partition(pid), proposed_partition(pid+1)-1, weight_in_this_rank, int(constraint1_in_this_rank));
                 constraint1_in_this_rank = 0.0;
                 weight_in_this_rank = 0.0;
                 pid++;

                 // The last rank takes all remaining nodes. Using >= (rather than ==) also stops
                 // the loop when there is only one rank, where pid would otherwise step past
                 // nproc-1 and later writes would land beyond the end of proposed_partition
                 if(pid>=nproc-1) break;
             }
             constraint1_in_this_rank += constraint1(i_node);
             weight_in_this_rank += weight(i_node);
         }

         // Last value will always be nnode+1
         proposed_partition(nproc) = nnode+1;

         // Check n_ptl in final rank
         constraint1_in_this_rank = 0.0;
         for(int i_node=proposed_partition(nproc-1)-1; i_node<nnode; i_node++){
             constraint1_in_this_rank += constraint1(i_node);
         }

         // Return false if the constraint was not met
         return !(constraint1_in_this_rank>=constraint1_max);
     }


     // Evaluates a partition against a per-node weight: sums the weight of the nodes owned by
     // each rank and returns the largest of those sums
     double get_largest_predicted_time(const View<int*,CLayout,HostType>& partition, const View<double*,HostType>& weight) const{
         double heaviest_load = 0.0;
         for(int i_rank=0; i_rank<(partition.size()-1); i_rank++){
             // The partition is 1-indexed: this rank owns nodes [first_node, end_node)
             const int first_node = partition(i_rank)-1;
             const int end_node = partition(i_rank+1)-1;

             double rank_load = 0.0;
             int i_node = first_node;
             while(i_node<end_node){
                 rank_load += weight(i_node);
                 i_node++;
             }

             if(rank_load>heaviest_load) heaviest_load = rank_load;
         }
         return heaviest_load;
     }


     /* Fills proposed_partition so that the given weight is spread as evenly as the constraint allows.
      * An even split of the weight is tried first. If that violates the constraint, the target weight
      * per rank is searched between the even share and the largest load of a partition that balances
      * the constraint itself. */
     void one_weight_balance(const View<double*,HostType>& weight, const View<double*,CLayout,HostType> constraint1){
         const int n_ranks = proposed_partition.size()-1;

         // The average weight is what every rank would carry in the ideal case
         const double even_share = get_even_division(weight, n_ranks);

         // First attempt: aim for the even share
         bool feasible = greedily_fill_partition(weight, constraint1, even_share);
         if(feasible) return;

         // The even split breaks the constraint. Balance the constraint itself to obtain a
         // partition that is known to be allowed
         feasible = greedily_fill_partition(constraint1, constraint1, get_even_division(constraint1, n_ranks));
         if(!feasible) exit_XGC("\nUnexpected issue in load balance: constraint-based partition doesn't satisfy constraints\n");

         // The heaviest rank of that partition bounds the search from above
         const double ceiling_weight = get_largest_predicted_time(proposed_partition, weight);
         if(verbose) printf("The ideal distribution (%1.3e) does not satisfy constraints. This distribution (%1.3e) does.\n", even_share, ceiling_weight);

         // Search between the even share and the ceiling, halving the step each time, until
         // the step is no larger than a fixed fraction of the even share
         const double desired_precision = 0.01; // Fraction of imbalance that's worth even checking whether it can be removed
         const double smallest_step = even_share*desired_precision;
         double step = ceiling_weight - even_share;
         double target_weight = ceiling_weight;
         if(verbose) printf("\nEmploying a binary search to narrow down on the best option.");
         while(step>smallest_step){
             step *= 0.5;

             // Move down after a success to reduce the imbalance, up after a failure
             target_weight += (feasible ? -step : step);

             feasible = greedily_fill_partition(weight, constraint1, target_weight);
             if(verbose) printf("\n  Stepped by %1.3e to %1.3e. The new partition does%s meet constraints.", step, target_weight, feasible?"" : "nt");
         }

         // The search may stop on a failing target; keep stepping up until one passes
         while(!feasible){
             target_weight += step;
             feasible = greedily_fill_partition(weight, constraint1, target_weight);
             if(verbose) printf("\n  Stepped by %1.3e UP to %1.3e. The new partition does%s meet constraints.", step, target_weight, feasible?"" : "nt");
         }
         if(verbose) printf("\n");
     }


     // TODO

     // Evaluate overall model performance

     // MAIN_LOOP time vs sum of regions (% coverage)

     //


     /* This function assesses the proposed partition and decides whether it is worth switching to.

      * The simplest formulation is to recommend the new partition if the new partition is predicted to be faster

      * than the existing one. There is also a factor to account for historical inaccuracy of the previous estimate.

      * A future option would be to require a certain level of improvement before repartitioning. */

     bool recommend_proposed_partition(){

         // Total time spent in regions (assuming that global barriers demarcate them):

         double observed_max_all_rgns_time = 0.0;

         for(int i=0; i<regions.size(); i++){

             observed_max_all_rgns_time += regions[i].get_observed_max_region_time();

         }


         // Total predicted time, accounting for observed prediction undershoot for each region

         double predicted_max_all_rgns_time = 0.0;

         for(int i=0; i<regions.size(); i++){

             predicted_max_all_rgns_time += regions[i].get_largest_predicted_time(proposed_partition)*regions[i].get_prediction_undershoot();

         }


         double fractional_improvement = 1.0 - predicted_max_all_rgns_time/observed_max_all_rgns_time;


         if(verbose){

             printf("\nDetermining whether to adopt the proposed partition:");

             printf("\n Observed time with current partition was: %1.3e", observed_max_all_rgns_time);

             printf("\n Predicted time with proposed partition, adjusting for historical undershoot (of Rgn 0: %1.3e): %1.3e", regions[0].get_prediction_undershoot(), predicted_max_all_rgns_time);

             printf("\n Adopted if Fractional improvement of adjusted prediction (%1.3e) exceeds specified threshold (%1.3e)\n", fractional_improvement, threshold_to_rebalance);

         }


         // Recommend the new partition if the predicted time is less than the time observed with the current partition

         if(fractional_improvement > threshold_to_rebalance){

             return true;

         }else{

             return false;

         }

     }


     // Prints one line per rank of proposed_partition: the node count and the 1-indexed node range
     void print_new_partition(){
         for(int rank = 0; rank<proposed_partition.size()-1; rank++){
             const int first_node = proposed_partition(rank);
             const int next_first_node = proposed_partition(rank+1);
             printf("process %d : nnodes = %d; range = (%d - %d)\n", rank, next_first_node - first_node, first_node, next_first_node-1);
         }
     }


     void update_model(const View<int*,CLayout,HostType>& current_partition){

         for(int i=0; i<regions.size(); i++){

             regions[i].update_model(current_partition);

         }

     }


     // Overload if providing timings manually rather than using camtimers. This is used for testing.

     void update_model(const View<int*,CLayout,HostType>& current_partition, const std::vector<double>& manual_times){

         for(int i=0; i<regions.size(); i++){

             regions[i].update_model(current_partition, manual_times[i]);

         }

     }


     void initialize_model(){

         for(int i=0; i<regions.size(); i++){

             regions[i].initialize_model();

         }

     }


     bool model_is_initialized(){

         bool is_initialized=true;

         for(int i=0; i<regions.size(); i++){

             is_initialized = (is_initialized && regions[i].get_model_is_initialized());

         }

         return is_initialized;

     }


     // Computes proposed_partition with the requested algorithm. The particle count always acts
     // as the constraint. Algorithms other than SingleRegionBalance and ParticleBalance leave
     // proposed_partition untouched
     void propose_new_partition(const Kokkos::View<double*,Kokkos::LayoutRight,HostType>& ptl_count, WeightingAlgorithm weighting_algorithm){
         // Constrain based on particle count (memory)
         const auto& particle_constraint = ptl_count;

         switch(weighting_algorithm){
             case WeightingAlgorithm::SingleRegionBalance:
                 // The weight is the modeled time per vertex of the one and only region
                 if(regions.size()!=1) exit_XGC("Error: Load balancing is currently single-region only\n");
                 one_weight_balance(regions[0].get_estimated_time_per_vertex(), particle_constraint);
                 break;
             case WeightingAlgorithm::ParticleBalance:
                 // The weight is the particle count itself
                 one_weight_balance(ptl_count, particle_constraint);
                 break;
             default:
                 break;
         }

         if(verbose){
             printf("\nNewly PROPOSED partition:\n");
             print_new_partition();
         }
     }


     // Writes the new partition into pol_decomp.gvid0_pid_h and refreshes the data derived from it.

     // With the Fortran algorithm the partition is computed here; otherwise proposed_partition is

     // broadcast from rank 0 of comm and copied into pol_decomp.

     // vgrid is not used in this function.

     void set_new_partition(const Simulation<DeviceType>& sml, const Grid<DeviceType>& grid, const MagneticField<DeviceType>& magnetic_field,

                    const VelocityGrid& vgrid, Plasma& plasma, DomainDecomposition<DeviceType>& pol_decomp, WeightingAlgorithm weighting_algorithm){

         if(weighting_algorithm==WeightingAlgorithm::Fortran){

             // In the fortran algorithm the new distribution is calculated and set here

             // Count number of particles belonging to each node

             Kokkos::View<double**,Kokkos::LayoutRight,HostType> ptl_count = count_ptl_per_node_elec_main_ion(grid, magnetic_field, plasma, sml.electron_on);


             // A decomposition array with two entries describes a single rank

             if(pol_decomp.gvid0_pid_h.size()==2){

                 // Skip set_weights if there is only one rank per plane

                 pol_decomp.gvid0_pid_h(0) = 1;

                 pol_decomp.gvid0_pid_h(1) = pol_decomp.nnodes_on_plane + 1;

             }else{

                 TIMER("SET_WEIGHTS",

                     set_weights(pol_decomp.gvid0_pid_h.data(), ptl_count.data(), plasma.f0_node_cost.data()) );

             }

         }else{

 #ifdef USE_MPI

             // Broadcast proposal to all ranks

             MPI_Bcast(proposed_partition.data(), proposed_partition.size(), MPI_INT, 0, comm);

 #endif


             // Copy proposal to pol_decomp

             Kokkos::deep_copy(pol_decomp.gvid0_pid_h, proposed_partition);


             // Printed regardless of the verbose setting

             if(is_rank_zero()){

                 printf("\nNEW PARTITION:\n");

                 print_new_partition();

             }

         }


         // Update bounds and device copy of the decomposition array

         pol_decomp.update_pol_decomp();

         GPTLstart("update_flux_surf");

         pol_decomp.update_flux_surf(grid.flux_surfaces_h);

         GPTLstop("update_flux_surf");

     }


     // Moves data to the ranks that own it after the decomposition has changed:

     // first the f0 data (only if plasma.f0_grid() is true), then the particles of all species.

     // old_partition is passed on to f0_redistribute. sml is not used in this function.

     void redistribute_load(const Simulation<DeviceType>& sml, const Grid<DeviceType>& grid, const MagneticField<DeviceType>& magnetic_field,

                    const VelocityGrid& vgrid, Plasma& plasma, DomainDecomposition<DeviceType>& pol_decomp,

                    const View<int*,CLayout,HostType>& old_partition){

         if (plasma.f0_grid()){

             // Move f information to correct MPI rank after decomposition update

             TIMER("F0_REDISTRIBUTE",

                 f0_redistribute(plasma, pol_decomp, grid, magnetic_field, vgrid, old_partition) );

         }


         // Shift particles to correct MPI rank after decomposition update

         TIMER("SHIFT_R",

             shift_all_species(plasma, grid, magnetic_field, pol_decomp, Shift::NoPhase0) );

     }


     /* Decides whether the load should be rebalanced.
      *
      * reweight_option:     Always skips the assessment and returns true
      * weighting_algorithm: Fortran defers to the fortran assessment; ParticleBalance is always
      *                      approved; otherwise the proposed partition is evaluated on rank 0
      * f0_cost:             passed to the fortran load imbalance calculation
      *
      * When the proposed partition is evaluated, the decision is made on rank 0 and, in MPI builds,
      * broadcast over comm, so every rank of comm has to call this function. */
     bool will_rebalance(ReweightOption reweight_option, WeightingAlgorithm weighting_algorithm, double f0_cost){
         // No need to check if model is an improvement
         if(reweight_option==ReweightOption::Always) return true;

         if(weighting_algorithm==WeightingAlgorithm::Fortran){
             // Calculate load imbalance
             calculate_load_imbalance(f0_cost);

             // Assess if load imbalance merits a rebalancing
             int should_rebalance_int = assess_whether_to_rebalance_load();

             // Reset fortran cost trackers
             reset_cost_trackers();

             return (should_rebalance_int==1);
         }

         if(weighting_algorithm==WeightingAlgorithm::ParticleBalance){
             // Since ParticleBalance doesn't contain an internal model, cant evaluate the partition
             // so just always approve, for now.
             // Probably better to have the model updatable by something other than time so it can still
             // be assessed in this mode
             return true;
         }

         // Evaluate the proposed partition on rank 0. The flag starts at a defined value so that
         // the other ranks never read an uninitialized variable before the broadcast overwrites it.
         // An int is used because MPI_C_BOOL documentation is confusing
         int should_rebalance_int = 0;
         if(is_rank_zero()) should_rebalance_int = (recommend_proposed_partition() ? 1 : 0);
 #ifdef USE_MPI
         MPI_Bcast(&should_rebalance_int, 1, MPI_INT, 0, comm);
 #endif
         return (should_rebalance_int==1);
     }


     public:


     // Reads the load balance settings from the "load_balance_param" namelist (defaults are used if it

     // is absent), selects the default weighting algorithm, and in MPI builds sets up the load region

     // and the particle-count constraint.

     // sync_planes selects pol_decomp.mpi.comm (true) or pol_decomp.mpi.plane_comm (false) as the

     // communicator for broadcasts.

     // NOTE(review): regions and constraint1_max are only set inside the USE_MPI section below.

     LoadBalance(NLReader::NamelistReader& nlr, const DomainDecomposition<DeviceType>& pol_decomp, bool sync_planes = true)

       : proposed_partition(NoInit("proposed_partition"), pol_decomp.gvid0_pid_h.layout())

 #ifdef USE_MPI

         , comm(sync_planes ? pol_decomp.mpi.comm : pol_decomp.mpi.plane_comm)

 #endif

     {

         // Defaults used when the namelist is not present

         double max_mem_redist_gb = 10.0;

         std::string weighting_algorithm_str = "Fortran";

         std::string update_method_str = "NoHistory";

         if(nlr.namelist_present("load_balance_param")){

             nlr.use_namelist("load_balance_param");

             max_mem_redist_gb = nlr.get<double>("max_mem_redist_gb", 10.0); // Sets the maximum amount of memory per rank that can be allocated to particles, in gigabytes.

             weighting_algorithm_str = nlr.get<std::string>("weighting_algorithm", "Fortran"); // Load balancing method. "Fortran" is the default since the newer methods are experimental. "ParticleBalance" does not incorporate timing data, but only tries to equally distribute particles. "SingleRegionBalance" currently tries to balance the push.

             threshold_to_rebalance = nlr.get<double>("threshold_to_rebalance", 0.02); // How much better the projected timing of a proposed load redistribution must be than the current distribution in order to adopt the new one. e.g. 0.02 sets a threshold of 2% better.

             verbose = nlr.get<bool>("verbose", false); // Verbose output of the internal load distribution model and load balancing decisions.

             update_method_str = nlr.get<std::string>("update_method", "NoHistory"); // Methods for updating internal model of load distribution. "NoHistory" uses only the most recent time step, while "ExpHistory" averages the existing model with the new step's timing information. Does not apply when weighting_algorithm is "Fortran".

         }else{

             threshold_to_rebalance = 0.02;

             verbose = false;

         }


         // Simple model for memory usage from load balance - just particles

         double max_n_ptl_on_rank = max_mem_redist_gb*1024*1024*1024/80.0; // ~ 80 B per particle


         // Translate the algorithm name into the enum; any other name is an error

         if(weighting_algorithm_str=="Fortran"){

             default_weighting_algorithm = WeightingAlgorithm::Fortran;

         }else if(weighting_algorithm_str=="ParticleBalance"){

             default_weighting_algorithm = WeightingAlgorithm::ParticleBalance;

         }else if(weighting_algorithm_str=="SingleRegionBalance"){

             default_weighting_algorithm = WeightingAlgorithm::SingleRegionBalance;

         }else{

             exit_XGC("\nError: weighting_algorithm input not valid.");

         }


 #ifdef USE_MPI

         // If planes are sync'd, use intpl_comm to coordinate them; otherwise, use MPI_COMM_SELF

         MPI_Comm inter_period_comm = sync_planes ? pol_decomp.mpi.intpl_comm : MPI_COMM_SELF;


         // Set up load balance regions

         if(default_weighting_algorithm == WeightingAlgorithm::SingleRegionBalance){

             // Translate the update method name into the enum; any other name is an error

             LoadRegion::UpdateMethod update_method;

             if(update_method_str=="NoHistory"){

                 update_method = LoadRegion::UpdateMethod::NoHistory;

             }else if(update_method_str=="ExpHistory"){

                 update_method = LoadRegion::UpdateMethod::ExpHistory;

             }else{

                 exit_XGC("\nError: update_method input not valid.");

             }


             // A single region named "Push" that is timed by the four timers listed below

             regions.push_back(LoadRegion(pol_decomp.nnodes_on_plane, inter_period_comm, pol_decomp.mpi.plane_comm, update_method, verbose,

                                          "Push",

                                          {"ipc1:PUSHE",

                                           "ipc1:PUSHI",

                                           "ipc2:PUSHE",

                                           "ipc2:PUSHI"}));

         }


         // Hard-code for now

         ConstraintOption constraint_option = ConstraintOption::ParticleCount;


         if(constraint_option == ConstraintOption::ParticleCount){

             // Normalize by n_planes since we are using the sum of planes as the constraint

             int n_periods;

             MPI_Comm_size(inter_period_comm, &n_periods);

             constraint1_max = max_n_ptl_on_rank*n_periods;

         }else{

             exit_XGC("\nError: ParticleConstraint is only available ConstraintOption.\n");

         }

 #endif

     }


     /**
      * Rebalance the poloidal domain decomposition and redistribute the load accordingly.
      *
      * Returns immediately if poloidal decomposition is turned off. The remainder of the
      * routine is only compiled when USE_MPI is defined.
      *
      * Unless the Fortran algorithm is selected, the routine prints per-species particle
      * statistics on rank zero, counts particles per node, updates the timing model (for
      * algorithms other than ParticleBalance) and lets rank zero propose a new partition.
      * If the timing model has not been initialized yet, it is initialized and the routine
      * returns without proposing a partition. Finally, if will_rebalance() returns true,
      * the new partition is set and particles and f0 are redistributed.
      *
      * @param[in] sml Simulation object, forwarded to set_new_partition/redistribute_load
      * @param[in] grid Grid object
      * @param[in] magnetic_field Magnetic field object
      * @param[in] vgrid Velocity grid object
      * @param[in,out] plasma Plasma whose particles and f0 are counted and redistributed
      * @param[in,out] pol_decomp Domain decomposition; updated if a rebalance takes place
      * @param[in] reweight_option Forwarded to will_rebalance()
      * @param[in] weighting_algorithm Algorithm to use; Default selects default_weighting_algorithm
      */
     void rebalance(const Simulation<DeviceType>& sml, const Grid<DeviceType>& grid, const MagneticField<DeviceType>& magnetic_field,
                    const VelocityGrid& vgrid, Plasma& plasma, DomainDecomposition<DeviceType>& pol_decomp, ReweightOption reweight_option,
                    WeightingAlgorithm weighting_algorithm = WeightingAlgorithm::Default){

         // Skip rebalancing if poloidal decomposition is turned off. Normally we should put the function in
         // an if statement rather than using return; kept here in case we might want some balancing for some reason
         // even without poloidal decomposition
         if(pol_decomp.pol_decomp==false) return;

         // Rebalance is trivial if there is only one rank on a plane (generalize this from no-MPI case)
 #ifdef USE_MPI

         // Resolve the algorithm: "Default" means the configured default, and a Fortran default
         // always overrides the requested algorithm (it should always use the fortran load balancer)
         if(weighting_algorithm==WeightingAlgorithm::Default || default_weighting_algorithm==WeightingAlgorithm::Fortran){
             weighting_algorithm = default_weighting_algorithm;
         }

         if(is_rank_zero() && verbose){
             if(weighting_algorithm==WeightingAlgorithm::Fortran) printf("\nLoad balance called with Fortran algorithm\n");
             else if(weighting_algorithm==WeightingAlgorithm::ParticleBalance) printf("\nLoad balance called with Ptl algorithm\n");
             else if(weighting_algorithm==WeightingAlgorithm::SingleRegionBalance) printf("\nLoad balance called with SingleRegion algorithm\n");
         }

         GPTLstart("REBALANCE");
         if(weighting_algorithm!=WeightingAlgorithm::Fortran){
             // Report the particle imbalance of each non-adiabatic species
             GPTLstart("PTL_COUNT");
             plasma.for_all_nonadiabatic_species([&](Species<DeviceType>& species){
                 const long long int total_n_ptl = species.get_total_n_ptl();
                 const int max_n_ptl = species.get_max_n_ptl();
                 const double avg_n_ptl = static_cast<double>(total_n_ptl)/pol_decomp.mpi.nranks;
                 const double max_ratio = max_n_ptl/avg_n_ptl;
                 if(is_rank_zero()) printf("  Species %d: (max/avg ptl per rank = %1.2f); total n_ptl = %lld\n", species.idx, max_ratio, total_n_ptl);
             }, Plasma::NoDevicePtl);
             GPTLstop("PTL_COUNT");

             GPTLstart("PTL_COUNT_PER_NODE");
             auto ptl_count = count_all_ptl_per_node(grid, magnetic_field, plasma);
             GPTLstop("PTL_COUNT_PER_NODE");

             if(weighting_algorithm!=WeightingAlgorithm::ParticleBalance){
                 if(model_is_initialized()){
                     // Update the model with the latest timing
                     TIMER("LOAD_BAL_UPDATE",
                         update_model(pol_decomp.gvid0_pid_h) );
                 }else{
                     // First call: set up the model and leave without proposing a partition.
                     // The message ends with a newline so it does not run into the next output.
                     if(is_rank_zero() && verbose) printf("Initializing timing model, no partition proposal yet.\n");
                     initialize_model();
                     // Close the outer timer before the early return so start/stop stay paired
                     GPTLstop("REBALANCE");
                     return;
                 }
             }

             // With the updated model, propose a new partition (on rank zero only)
             GPTLstart("LOAD_BAL_NEW_PART");
             if (is_rank_zero()) propose_new_partition(ptl_count, weighting_algorithm);
             GPTLstop("LOAD_BAL_NEW_PART");
         }

         GPTLstart("LOAD_BAL_REBAL");
         // The f0 cost is only taken into account when the plasma uses an f0 grid
         const double f0_cost = plasma.f0_grid() ? sum_view(plasma.f0_node_cost) : 0.0;
         if(will_rebalance(reweight_option, weighting_algorithm, f0_cost)){
             // Save the old partition, it is used to save some communication in the f0 redistribution
             View<int*,CLayout,HostType> old_partition(NoInit("old_partition"), pol_decomp.gvid0_pid_h.layout());
             Kokkos::deep_copy(old_partition, pol_decomp.gvid0_pid_h);

             // Update pol_decomp
             TIMER("LOAD_BAL_SET_NEW",
                 set_new_partition(sml, grid, magnetic_field, vgrid, plasma, pol_decomp, weighting_algorithm) );

             // Update particles and f0
             TIMER("LOAD_BAL_REDIST",
                 redistribute_load(sml, grid, magnetic_field, vgrid, plasma, pol_decomp, old_partition) );
         }
         GPTLstop("LOAD_BAL_REBAL");
         GPTLstop("REBALANCE");
 #endif
     }


     /**
      * More generic rebalance, driven by an externally supplied constraint and timings
      * rather than by the plasma. Only has an effect when USE_MPI is defined.
      *
      * The timing model is updated with the supplied timings, rank zero proposes a new
      * partition using default_weighting_algorithm, and, if will_rebalance() returns true,
      * the proposal is broadcast, copied into pol_decomp and the decomposition is updated.
      * No load is redistributed by this routine.
      *
      * @param[in,out] pol_decomp Domain decomposition; gvid0_pid_h is overwritten on rebalance
      * @param[in] constraint Per-vertex constraint passed to propose_new_partition()
      * @param[in] timings Manually supplied timings passed to update_model()
      * @param[out] load_imbalance Observed load imbalance of the first region; left untouched
      *                            if there are no regions
      * @param[out] model_belief Estimated time per vertex of the first region; left untouched
      *                          if there are no regions
      */
     void rebalance(DomainDecomposition<DeviceType>& pol_decomp, const View<double*,CLayout,HostType>& constraint, const std::vector<double>& timings, double& load_imbalance, View<double*,HostType>& model_belief){
 #ifdef USE_MPI
         update_model(pol_decomp.gvid0_pid_h, timings);

         // Only report model outputs if a region exists: indexing an empty vector is undefined behavior
         if(!regions.empty()){
             const LoadRegion& first_region = regions[0];
             load_imbalance = first_region.get_observed_load_imbalance();
             model_belief = first_region.get_estimated_time_per_vertex();
         }

         // Propose new partition
         if (is_rank_zero()) propose_new_partition(constraint, default_weighting_algorithm);

         // There is no f0 load in this generic version
         const double f0_cost = 0.0;
         if(will_rebalance(ReweightOption::IfBetter, default_weighting_algorithm, f0_cost)){
             // Broadcast proposal to all ranks (MPI takes the element count as an int)
             const int n_partition_entries = static_cast<int>(proposed_partition.size());
             MPI_Bcast(proposed_partition.data(), n_partition_entries, MPI_INT, 0, comm);

             // Copy proposal to pol_decomp
             Kokkos::deep_copy(pol_decomp.gvid0_pid_h, proposed_partition);

             // Update bounds and device copy of the decomposition array
             pol_decomp.update_pol_decomp();

             // If there were a real load, redistribute it here
         }
 #endif
     }

 };


 #endif

LoadBalance::WeightingAlgorithm::Default

LoadRegion::model_has_history
bool model_has_history
Definition: load_balance.hpp:30

LoadRegion::estimated_time_per_vertex
View< double *, HostType > estimated_time_per_vertex
Definition: load_balance.hpp:25

GPTLstart
static int GPTLstart(const char *name)
Definition: timer_macro.hpp:9

LoadBalance::will_rebalance
bool will_rebalance(ReweightOption reweight_option, WeightingAlgorithm weighting_algorithm, double f0_cost)
Definition: load_balance.hpp:580

LoadRegion::UpdateMethod::ExpHistory

is_rank_zero
bool is_rank_zero()
Definition: globals.hpp:27

LoadBalance::one_weight_balance
void one_weight_balance(const View< double *, HostType > &weight, const View< double *, CLayout, HostType > constraint1)
Definition: load_balance.hpp:388

LoadRegion::UpdateMethod::NoHistory

LoadBalance::threshold_to_rebalance
double threshold_to_rebalance
Definition: load_balance.hpp:300

NLReader::NamelistReader::get
T get(const string &param, const T default_val, int val_ind=0)
Definition: NamelistReader.hpp:373

LoadRegion::time_accumulated
double time_accumulated
Definition: load_balance.hpp:44

shift.hpp

Plasma::f0_grid
bool f0_grid() const
Definition: plasma.hpp:196

LoadRegion::predicted_max_region_time
double predicted_max_region_time
Definition: load_balance.hpp:31

LoadRegion::observed_max_region_time
double observed_max_region_time
Definition: load_balance.hpp:32

LoadBalance::get_largest_predicted_time
double get_largest_predicted_time(const View< int *, CLayout, HostType > &partition, const View< double *, HostType > &weight) const
Definition: load_balance.hpp:372

VelocityGrid
Definition: velocity_grid.hpp:8

Simulation
Definition: sml.hpp:8

LoadRegion::update_model_exp_history
void update_model_exp_history(const View< int *, CLayout, HostType > &current_partition, const View< double *, HostType > &all_periods_timings)
Definition: load_balance.hpp:62

LoadRegion::n_unique_ranks
int n_unique_ranks
How many ranks are in a 'period' (in tokamaks, in a plane)
Definition: load_balance.hpp:41

LoadBalance::LoadBalance
LoadBalance(NLReader::NamelistReader &nlr, const DomainDecomposition< DeviceType > &pol_decomp, bool sync_planes=true)
Definition: load_balance.hpp:620

LoadBalance::WeightingAlgorithm::Fortran

plasma
subroutine plasma(grid, itr, p, dene_out, deni_out, Te_out, Ti_out, Vparai_out)
Calculate the plasma density, temperature, and parallel velocity for a point in triangle itr using pl...
Definition: neutral_totalf.F90:1248

LoadBalance::rebalance
void rebalance(DomainDecomposition< DeviceType > &pol_decomp, const View< double *, CLayout, HostType > &constraint, const std::vector< double > &timings, double &load_imbalance, View< double *, HostType > &model_belief)
Definition: load_balance.hpp:775

Plasma::NoDevicePtl
Definition: plasma.hpp:101

NLReader::NamelistReader
Definition: NamelistReader.hpp:193

MagneticField
Definition: magnetic_field.hpp:12

LoadBalance::propose_new_partition
void propose_new_partition(const Kokkos::View< double *, Kokkos::LayoutRight, HostType > &ptl_count, WeightingAlgorithm weighting_algorithm)
Definition: load_balance.hpp:514

Species::idx
int idx
Index in all_species.
Definition: species.hpp:78

LoadBalance::verbose
bool verbose
Definition: load_balance.hpp:301

count_ptl_per_node.hpp

Plasma::for_all_nonadiabatic_species
void for_all_nonadiabatic_species(F func, DevicePtlOpt device_ptl_opt=UseDevicePtl)
Definition: plasma.hpp:120

Grid< DeviceType >

LoadRegion::prediction_undershoot
double prediction_undershoot
Definition: load_balance.hpp:34

LoadBalance::get_even_division
double get_even_division(const View< double *, HostType > &input, int n) const
Definition: load_balance.hpp:305

count_ptl_per_node_elec_main_ion
Kokkos::View< double **, Kokkos::LayoutRight, HostType > count_ptl_per_node_elec_main_ion(const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, Plasma &plasma, bool kinetic_electrons)
Definition: count_ptl_per_node.cpp:29

LoadRegion::observed_load_imbalance
double observed_load_imbalance
Definition: load_balance.hpp:33

DomainDecomposition::update_pol_decomp
void update_pol_decomp()
Definition: domain_decomposition.cpp:229

LoadBalance::update_model
void update_model(const View< int *, CLayout, HostType > &current_partition)
Definition: load_balance.hpp:487

Species::get_total_n_ptl
long long int get_total_n_ptl()
Definition: species.hpp:693

f0_redistribute
void f0_redistribute(Plasma &plasma, const DomainDecomposition< DeviceType > &pol_decomp, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const VelocityGrid &vgrid, const View< int *, CLayout, HostType > &old_partition)
Definition: f0_redistribute.cpp:337

LoadBalance::set_new_partition
void set_new_partition(const Simulation< DeviceType > &sml, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const VelocityGrid &vgrid, Plasma &plasma, DomainDecomposition< DeviceType > &pol_decomp, WeightingAlgorithm weighting_algorithm)
Definition: load_balance.hpp:529

TIMER
#define TIMER(N, F)
Definition: timer_macro.hpp:24

LoadBalance::rebalance
void rebalance(const Simulation< DeviceType > &sml, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const VelocityGrid &vgrid, Plasma &plasma, DomainDecomposition< DeviceType > &pol_decomp, ReweightOption reweight_option, WeightingAlgorithm weighting_algorithm=WeightingAlgorithm::Default)
Definition: load_balance.hpp:691

NLReader::NamelistReader::use_namelist
void use_namelist(const string &namelist)
Definition: NamelistReader.hpp:355

LoadBalance::ReweightOption::Always

LoadBalance::ConstraintOption
ConstraintOption
Definition: load_balance.hpp:279

LoadBalance::update_model
void update_model(const View< int *, CLayout, HostType > &current_partition, const std::vector< double > &manual_times)
Definition: load_balance.hpp:494

LoadRegion::get_estimated_time_per_vertex
double get_estimated_time_per_vertex(int i) const
Definition: load_balance.hpp:157

LoadRegion::my_period_rank
int my_period_rank
Definition: load_balance.hpp:42

LoadBalance::ConstraintOption::ParticleCount

DomainDecomposition::nnodes_on_plane
int nnodes_on_plane
Number of nodes on local plane.
Definition: domain_decomposition.hpp:83

f0_redistribute.hpp

LoadBalance::WeightingAlgorithm::SingleRegionBalance

calculate_load_imbalance
void calculate_load_imbalance(double f0_cost)

LoadRegion::initialize_model
void initialize_model()
Definition: load_balance.hpp:198

LoadBalance::default_weighting_algorithm
WeightingAlgorithm default_weighting_algorithm
Definition: load_balance.hpp:290

LoadBalance::proposed_partition
View< int *, CLayout, HostType > proposed_partition
Which processors get which vertices.
Definition: load_balance.hpp:294

LoadRegion::get_time_since_previous_call
double get_time_since_previous_call()
Definition: load_balance.hpp:92

LoadRegion::get_model_is_initialized
bool get_model_is_initialized() const
Definition: load_balance.hpp:177

LoadRegion::UpdateMethod
UpdateMethod
Definition: load_balance.hpp:18

LoadRegion::region_name
std::string region_name
Definition: load_balance.hpp:26

LoadRegion::verbose
bool verbose
Definition: load_balance.hpp:28

view_arithmetic.hpp

LoadBalance::regions
std::vector< LoadRegion > regions
Definition: load_balance.hpp:292

NLReader::NamelistReader::namelist_present
bool namelist_present(const string &namelist)
Definition: NamelistReader.hpp:351

LoadRegion::update_method
UpdateMethod update_method
Definition: load_balance.hpp:46

LoadRegion::model_is_initialized
bool model_is_initialized
Definition: load_balance.hpp:29

Simulation::electron_on
bool electron_on
Use kinetic electrons.
Definition: sml.hpp:54

reset_cost_trackers
void reset_cost_trackers()

LoadBalance::redistribute_load
void redistribute_load(const Simulation< DeviceType > &sml, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const VelocityGrid &vgrid, Plasma &plasma, DomainDecomposition< DeviceType > &pol_decomp, const View< int *, CLayout, HostType > &old_partition)
Definition: load_balance.hpp:566

LoadRegion::get_largest_predicted_time
double get_largest_predicted_time(const View< int *, CLayout, HostType > &proposed_partition) const
Definition: load_balance.hpp:182

LoadBalance::initialize_model
void initialize_model()
Definition: load_balance.hpp:500

sum_view
T::value_type sum_view(const T &view)
Definition: view_arithmetic.hpp:103

LoadBalance::recommend_proposed_partition
bool recommend_proposed_partition()
Definition: load_balance.hpp:449

DomainDecomposition::update_flux_surf
void update_flux_surf(const HostArray< VertexList > &surfaces)
Definition: domain_decomposition.cpp:118

timer_macro.hpp

exit_XGC
void exit_XGC(std::string msg)
Definition: globals.hpp:37

LoadBalance::greedily_fill_partition
bool greedily_fill_partition(const View< double *, HostType > &weight, const View< double *, HostType > &constraint1, double target_weight_per_rank)
Definition: load_balance.hpp:315

LoadBalance::WeightingAlgorithm::ParticleBalance

LoadRegion::n_periods
int n_periods
How many repeating periods there are; in tokamaks this is planes.
Definition: load_balance.hpp:40

magnetic_field
Definition: magnetic_field.F90:1

LoadRegion
Definition: load_balance.hpp:16

LoadBalance::print_new_partition
void print_new_partition()
Definition: load_balance.hpp:479

LoadBalance::WeightingAlgorithm
WeightingAlgorithm
Definition: load_balance.hpp:272

DomainDecomposition< DeviceType >

assess_whether_to_rebalance_load
int assess_whether_to_rebalance_load()

LoadRegion::get_estimated_time_per_vertex
View< double *, HostType > get_estimated_time_per_vertex() const
Definition: load_balance.hpp:161

Plasma
Definition: plasma.hpp:13

LoadBalance::constraint1_max
double constraint1_max
Definition: load_balance.hpp:303

set_weights
void set_weights(int *gvid0_pid, double *ptl_count, double *f0_node_cost)

count_all_ptl_per_node
View< double *, CLayout, HostType > count_all_ptl_per_node(const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, Plasma &plasma)
Definition: count_ptl_per_node.cpp:71

LoadBalance::ReweightOption
ReweightOption
Definition: load_balance.hpp:283

Plasma::f0_node_cost
View< double *, CLayout, HostType > f0_node_cost
Definition: plasma.hpp:30

shift_all_species
void shift_all_species(Plasma &plasma, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const DomainDecomposition< DeviceType > &pol_decomp, Shift::ShiftPh0 shift_ph0)
Definition: shift.cpp:305

LoadRegion::reset_timer
void reset_timer()
Definition: load_balance.hpp:152

Species
Definition: species.hpp:75

LoadRegion::update_model
void update_model(const View< int *, CLayout, HostType > &current_partition, double manual_time=-1.0)
Definition: load_balance.hpp:205

LoadBalance::model_is_initialized
bool model_is_initialized()
Definition: load_balance.hpp:506

LoadRegion::timer_names
std::vector< std::string > timer_names
Definition: load_balance.hpp:27

Shift::NoPhase0
Definition: shift.hpp:10

Grid::flux_surfaces_h
HostArray< VertexList > flux_surfaces_h
Definition: grid.hpp:198

DomainDecomposition::pol_decomp
bool pol_decomp
Use poloidal decomposition.
Definition: domain_decomposition.hpp:76

LoadRegion::get_observed_max_region_time
double get_observed_max_region_time() const
Definition: load_balance.hpp:169

LoadBalance
Definition: load_balance.hpp:270

LoadRegion::touch_timers
void touch_timers()
Definition: load_balance.hpp:116

NoInit
Kokkos::ViewAllocateWithoutInitializing NoInit
Definition: space_settings.hpp:69

LoadRegion::update_model_no_history
void update_model_no_history(const View< int *, CLayout, HostType > &current_partition, const View< double *, HostType > &all_periods_timings)
Definition: load_balance.hpp:48

LoadRegion::get_prediction_undershoot
double get_prediction_undershoot() const
Definition: load_balance.hpp:165

LoadRegion::get_observed_load_imbalance
double get_observed_load_imbalance() const
Definition: load_balance.hpp:173

LoadBalance::ReweightOption::IfBetter

Species::get_max_n_ptl
int get_max_n_ptl()
Definition: species.hpp:704

GPTLstop
static int GPTLstop(const char *name)
Definition: timer_macro.hpp:10

DomainDecomposition::gvid0_pid_h
Kokkos::View< int *, Kokkos::LayoutRight, HostType > gvid0_pid_h
Which processors get which vertices (host)
Definition: domain_decomposition.hpp:93