XGCa
load_balance.hpp
1 #ifndef LOAD_BALANCE_HPP
2 #define LOAD_BALANCE_HPP
3 
4 #include "timer_macro.hpp"
5 #include "shift.hpp"
6 #include "count_ptl_per_node.hpp"
7 #include "f0_redistribute.hpp"
8 
9 // Used for original fortran load balancer
10 extern "C" void calculate_load_imbalance();
11 extern "C" int assess_whether_to_rebalance_load();
12 extern "C" void set_weights(double* ptl_count);
13 
14 class LoadRegion{
15  public:
16  enum class UpdateMethod{
17  NoHistory=0
18  };
19 
20  private:
21 
22  View<double*,HostType> estimated_time_per_vertex;
23  std::string region_name;
24  bool verbose;
25  bool model_is_initialized;
26  double predicted_max_region_time;
27  double observed_max_region_time;
28  double prediction_undershoot;
29 
30 #ifdef USE_MPI
31  MPI_Comm inter_period_comm;
32  MPI_Comm period_comm;
33 #endif
34  int n_periods; ///< How many repeating periods there are; in tokamaks this is planes
35  int n_unique_ranks; ///< How many ranks are in a 'period' (in tokamaks, in a plane)
36  int my_period_rank;
37 
38  double time_accumulated;
39 
40  UpdateMethod update_method;
41 
42  void update_model_no_history(const View<int*,CLayout,HostType>& current_partition, const View<double*, HostType>& all_periods_timings){
43  // Loop over ranks' timers
44  for(int i=0; i<all_periods_timings.size(); i++){
45  int node_offset = current_partition(i) - 1;
46  int next_node_offset = current_partition(i+1) - 1;
47  int nnodes = next_node_offset - node_offset;
48  // Best 0th order estimate is to assume all vertices took the same time
49  double time_per_node = all_periods_timings(i)/nnodes;
50  for(int j=0; j<nnodes; j++){
51  estimated_time_per_vertex(j + node_offset) = time_per_node;
52  }
53  }
54  }
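// Illustrative example (hypothetical values, not from the original source): with
// current_partition = {1, 4, 8} (two ranks, seven vertices, 1-indexed) and
// all_periods_timings = {6.0, 8.0}, rank 0 owns vertices 0-2 and rank 1 owns
// vertices 3-6, so every entry of estimated_time_per_vertex becomes 2.0.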
55 
56  double get_time_since_previous_call(){
57  // Get time from camtimers (accumulated)
58  double new_accumulated_time = 0.0;
59  int ierr = GPTLget_wallclock(region_name.c_str(), -1, &new_accumulated_time);
60 
61  // If ierr != 0, timer is not yet defined, so the accumulated time is zero
62  if(ierr != 0) new_accumulated_time = 0.0;
63 
64  // Subtract previous accumulated time to get time spent since last update
65  double time_spent = new_accumulated_time - time_accumulated;
66 
67  // Save new accumulated time
68  time_accumulated = new_accumulated_time;
69 
70  return time_spent;
71  }
72 
73  public:
74 
75 #ifdef USE_MPI
76  LoadRegion(std::string region_name, int n_vertices, const MPI_Comm& inter_period_comm, const MPI_Comm& period_comm, UpdateMethod update_method, bool verbose)
77  : model_is_initialized(false),
78  verbose(verbose),
79  time_accumulated(0.0),
80  inter_period_comm(inter_period_comm),
81  period_comm(period_comm),
82  estimated_time_per_vertex("estimated_time_per_vertex",n_vertices),
83  predicted_max_region_time(0.0), prediction_undershoot(1.0),
84  update_method(update_method),
85  region_name(region_name)
86  {
87  // Initialize timer
88  reset_timer();
89 
90  // Get problem size from comms
91  MPI_Comm_rank(period_comm, &my_period_rank);
92  MPI_Comm_size(period_comm, &n_unique_ranks);
93  MPI_Comm_size(inter_period_comm, &n_periods);
94  }
95 
96  LoadRegion(std::string region_name, int n_vertices, const MPI_Comm& period_comm, UpdateMethod update_method, bool verbose)
97  : model_is_initialized(false),
98  verbose(verbose),
99  time_accumulated(0.0),
100  period_comm(period_comm),
101  estimated_time_per_vertex("estimated_time_per_vertex",n_vertices),
102  predicted_max_region_time(0.0), prediction_undershoot(1.0),
103  update_method(update_method),
104  region_name(region_name)
105  {
106  // Initialize timer
107  reset_timer();
108 
109  // Get problem size from comms
110  MPI_Comm_rank(period_comm, &my_period_rank);
111  n_periods = 1;
112  }
113 #else
114  LoadRegion(std::string region_name, int n_vertices, UpdateMethod update_method, bool verbose)
115  : model_is_initialized(false),
116  verbose(verbose),
117  time_accumulated(0.0),
118  estimated_time_per_vertex("estimated_time_per_vertex",n_vertices),
119  predicted_max_region_time(0.0), prediction_undershoot(1.0),
120  update_method(update_method),
121  region_name(region_name)
122  {
123  // Initialize timer
124  reset_timer();
125 
126  // Get problem size from comms
127  my_period_rank = 0;
128  n_periods = 1;
129  }
130 #endif
131 
132  void reset_timer(){
133  // Reset timer by getting the time since the previous call and suppressing the output
134  double time_spent = get_time_since_previous_call();
135  }
136 
137  double get_estimated_time_per_vertex(int i) const{
138  return estimated_time_per_vertex(i);
139  }
140 
141  View<double*,HostType> get_estimated_time_per_vertex() const{
142  return estimated_time_per_vertex;
143  }
144 
145  double get_prediction_undershoot() const{
146  return prediction_undershoot;
147  }
148 
149  double get_observed_max_region_time() const{
150  return observed_max_region_time;
151  }
152 
153  bool get_model_is_initialized() const{
154  return model_is_initialized;
155  }
156 
157  // Predict performance of new partition based on model
158  double get_largest_predicted_time(const View<int*,CLayout,HostType>& proposed_partition) const{
159  double largest_time = 0.0;
160  //int largest_time_ind = 0;
161  for(int i=0; i<(proposed_partition.size()-1); i++){
162  double proc_time = 0.0;
163  for(int i_node=proposed_partition(i)-1; i_node<proposed_partition(i+1)-1; i_node++){
164  proc_time += estimated_time_per_vertex(i_node);
165  }
166  if(proc_time>largest_time){
167  //largest_time_ind = i;
168  largest_time = proc_time;
169  }
170  }
171  return largest_time;
172  }
173 
174  void initialize_model(){
175  reset_timer();
176 
177  // Model can now be used
178  model_is_initialized = true;
179  }
180 
181  void update_model(const View<int*,CLayout,HostType>& current_partition){
182  if(update_method==UpdateMethod::NoHistory){
183  // Get time spent since last model update
184  double time_spent_this_rank = get_time_since_previous_call();
185 
186  // Reduce onto plane 0
187  // Could try MPI_SUM rather than MPI_MAX
188  double time_spent = 0.0;
189  if(n_periods>1){
190 #ifdef USE_MPI
191  MPI_Reduce(&time_spent_this_rank, &time_spent, 1, MPI_DOUBLE, MPI_MAX, 0, inter_period_comm);
192 #endif
193  }else{
194  time_spent = time_spent_this_rank;
195  }
196 
197  // Allocate for the timing of each rank
198  View<double*, HostType> all_periods_timings(NoInit("all_periods_timings"), n_unique_ranks);
199 #ifdef USE_MPI
200  // Gather from all ranks in plane 0
201  MPI_Gather(&time_spent, 1, MPI_DOUBLE, all_periods_timings.data(), 1, MPI_DOUBLE, 0, period_comm);
202 #else
203  all_periods_timings(0) = time_spent;
204 #endif
205 
206  if (is_rank_zero()){
207  // Look at the timing prediction made last time
208  predicted_max_region_time = get_largest_predicted_time(current_partition);
209  if(verbose) printf("\nPredicted max time in this region was %1.5e", predicted_max_region_time);
210 
211  // Get max region time
212  observed_max_region_time = 0.0;
213  for(int i=0; i<all_periods_timings.size(); i++){
214  observed_max_region_time = std::max(observed_max_region_time, all_periods_timings(i));
215  }
216  if(verbose) printf("\nObserved max time in this region was %1.5e", observed_max_region_time);
217 
218  // Take the max here so that there is never an assumed overshoot
219  if(predicted_max_region_time!=0.0){ // If time is zero, there hasn't been a prediction yet
220  prediction_undershoot = std::max(1.0, observed_max_region_time/predicted_max_region_time);
221  }
222  if(verbose) printf("\nThe prediction undershot by a factor of %1.5e", observed_max_region_time/predicted_max_region_time);
223 
224  update_model_no_history(current_partition, all_periods_timings);
225  }
226  }else{
227  exit_XGC("Error: Update method not available\n");
228  }
229  }
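// Numerical example (hypothetical values; assumes the max(1.0, ...) update
// reconstructed above): a previous prediction of 0.9 s against an observed
// max of 1.0 s gives prediction_undershoot = 1.0/0.9 ~= 1.11; later
// predictions are scaled by this factor in recommend_proposed_partition().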
230 };
231 
232 class LoadBalance{
233  public:
234  enum class WeightingAlgorithm{
235  SingleRegionBalance=0,
236  ParticleBalance,
237  Fortran,
238  Default
239  };
240 
241  enum class ConstraintOption{
242  ParticleCount=0
243  };
244 
245  enum class ReweightOption{
246  Always=0,
247  IfBetter
248  };
249 
250  private:
251 
252  WeightingAlgorithm default_weighting_algorithm;
253 
254  std::vector<LoadRegion> regions;
255 
256  View<int*,CLayout,HostType> proposed_partition;
257 
258  double threshold_to_rebalance;
259  bool verbose;
260 
261  double constraint1_max;
262 
263  double get_even_division(const View<double*,HostType>& input, int n) const{
264  double total = 0.0;
265  Kokkos::parallel_reduce("sum", Kokkos::RangePolicy<HostExSpace>(0,input.size()), [=]( const int i, double& l_total){
266  l_total += input(i);
267  }, total);
268 
269  return (total/n);
270  }
271 
272 
273  bool greedily_fill_partition(const View<double*,HostType>& weight, const View<double*,HostType>& constraint1, double target_weight_per_rank){
274  int nnode = weight.size();
275  int nproc = proposed_partition.size()-1;
276 
277  // Start from rank 0 of the plane; assign nodes until it meets the desired weight per rank, then move to next rank
278  double constraint1_in_this_rank = 0.0;
279  double weight_in_this_rank = 0.0;
280  proposed_partition(0) = 1; // 1-indexed
281  int pid = 0;
282  bool assign_one_node_per_proc = false;
283  for(int i_node = 0; i_node<nnode; i_node++){
284  // Since every rank needs at least one node, switch to assigning at every
285  // iteration if there are only as many nodes left as ranks
286  if(nnode-i_node==nproc-pid) assign_one_node_per_proc = true;
287 
288  // Check if a criterion is met to consider the rank sufficiently loaded
289  bool rank_is_loaded = false;
290 
291  // Criterion 1: We are out of nodes
292  if(assign_one_node_per_proc) rank_is_loaded = true;
293 
294  // Criterion 2: We have hit a constraint
295  if(constraint1_in_this_rank>=constraint1_max) rank_is_loaded = true;
296 
297  // Criterion 3: We have loaded the rank with an even amount of work
298  if(weight_in_this_rank>=target_weight_per_rank) rank_is_loaded = true;
299 
300  // If there are enough particles assigned to this rank, move to the next rank
301  if(rank_is_loaded){
302  proposed_partition(pid+1) = i_node+1; // 1-indexed
303  if(verbose) printf("\nRank %d is loaded (Nodes %d-%d); weight=%1.4e, ptl*nplanes=%d", pid, proposed_partition(pid), proposed_partition(pid+1)-1, weight_in_this_rank, int(constraint1_in_this_rank));
304  constraint1_in_this_rank = 0.0;
305  weight_in_this_rank = 0.0;
306  pid++;
307  if(pid==nproc-1) break;
308  }
309  constraint1_in_this_rank += constraint1(i_node);
310  weight_in_this_rank += weight(i_node);
311  }
312 
313  // Last value will always be nnode+1
314  proposed_partition(nproc) = nnode+1;
315 
316  // Check n_ptl in final rank
317  constraint1_in_this_rank = 0.0;
318  for(int i_node=proposed_partition(nproc-1)-1; i_node<nnode; i_node++){
319  constraint1_in_this_rank += constraint1(i_node);
320  }
321  if(constraint1_in_this_rank>=constraint1_max){
322  // Return false if the constraint was not met
323  return false;
324  }else{
325  return true;
326  }
327  }
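// Illustrative trace (hypothetical values): nnode=6, nproc=3,
// weight = {3,1,2,2,1,3}, a non-binding constraint, and target 12/3 = 4
// yield proposed_partition = {1,3,5,7}: rank 0 gets vertices 1-2,
// rank 1 gets vertices 3-4, rank 2 gets vertices 5-6, each with weight 4,
// and the function returns true.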
328 
329  // Predict performance of new partition based on model
330  double get_largest_predicted_time(const View<int*,CLayout,HostType>& partition, const View<double*,HostType>& weight) const{
331  double largest_time = 0.0;
332  //int largest_time_ind = 0;
333  for(int i=0; i<(partition.size()-1); i++){
334  double proc_time = 0.0;
335  for(int i_node=partition(i)-1; i_node<partition(i+1)-1; i_node++){
336  proc_time += weight(i_node);
337  }
338  if(proc_time>largest_time){
339  //largest_time_ind = i;
340  largest_time = proc_time;
341  }
342  }
343  return largest_time;
344  }
345 
346  void one_weight_balance(const View<double*,HostType>& weight, const View<double*,CLayout,HostType> constraint1){
347  int nproc = proposed_partition.size()-1;
348 
349  // Ideally, each rank would get the same amount of work, i.e. the average
350  double ideal_weight_per_rank = get_even_division(weight, nproc);
351 
352  // Make initial attempt, targeting even distribution of work
353  bool meets_constraints = greedily_fill_partition(weight, constraint1, ideal_weight_per_rank);
354 
355  if(!meets_constraints){
356  // An equal work distribution cannot be assigned due to a constraint.
357  // First, determine the weights if only the constraint were followed
358  meets_constraints = greedily_fill_partition(constraint1, constraint1, get_even_division(constraint1, nproc));
359  if(!meets_constraints) exit_XGC("\nUnexpected issue in load balance: constraint-based partition doesn't satisfy constraints\n");
360 
361  // This partition meets the constraints, but the proposed timing is likely unacceptable
362  double upper_limit_weight_per_rank = get_largest_predicted_time(proposed_partition, weight);
363  if(verbose) printf("The ideal distribution (%1.3e) does not satisfy constraints. This distribution (%1.3e) does.\n", ideal_weight_per_rank, upper_limit_weight_per_rank);
364 
365  // Bisect the difference between the original target time and the upper limit, and see if that satisfies the constraints
366  // Continue until the load imbalance has been minimized to a precision based on the original desired time
367  const double desired_precision = 0.01; // Fraction of imbalance that's worth even checking whether it can be removed
368  double desired_step_size = ideal_weight_per_rank*desired_precision;
369  double step_size = upper_limit_weight_per_rank - ideal_weight_per_rank;
370  double compromise_weight_per_rank = upper_limit_weight_per_rank;
371  if(verbose) printf("\nEmploying a binary search to narrow down on the best option.");
372  while(step_size>desired_step_size){
373  // Halve the size of the step
374  step_size /= 2;
375 
376  if(meets_constraints){
377  // Try reducing the load imbalance since constraints were met
378  compromise_weight_per_rank -= step_size;
379  }else{
380  // Try raising the load imbalance since constraints were broken
381  compromise_weight_per_rank += step_size;
382  }
383 
384  // Get new partition
385  meets_constraints = greedily_fill_partition(weight, constraint1, compromise_weight_per_rank);
386  if(verbose) printf("\n Stepped by %1.3e to %1.3e. The new partition does%s meet constraints.", step_size, compromise_weight_per_rank, meets_constraints?"" : " not");
387  }
388  // In case we end at a partition that fails
389  while(!meets_constraints){
390  compromise_weight_per_rank += step_size;
391  meets_constraints = greedily_fill_partition(weight, constraint1, compromise_weight_per_rank);
392  if(verbose) printf("\n Stepped by %1.3e UP to %1.3e. The new partition does%s meet constraints.", step_size, compromise_weight_per_rank, meets_constraints?"" : " not");
393  }
394  if(verbose) printf("\n");
395  }
396  }
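// Walk-through (hypothetical values): if ideal_weight_per_rank = 6.0 is
// infeasible but upper_limit_weight_per_rank = 10.0 is feasible, the search
// starts at 10.0 with step 4.0 and tries 8.0, then 7.0 or 9.0 depending on
// feasibility, halving the step until it drops below 0.01*6.0 = 0.06, so
// the smallest feasible target is bracketed to about 1% of the ideal weight.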
397 
398  // TODO
399  // Evaluate overall model performance
400  // MAIN_LOOP time vs sum of regions (% coverage)
401  //
402 
403  /* This function assesses the proposed partition and decides whether it is worth switching to.
404  * The simplest formulation is to recommend the new partition if the new partition is predicted to be faster
405  * than the existing one. There is also a factor to account for historical inaccuracy of the previous estimate.
406  * A future option would be to require a certain level of improvement before repartitioning. */
407  bool recommend_proposed_partition(){
408  // Total time spent in regions (assuming that global barriers demarcate them):
409  double observed_max_all_rgns_time = 0.0;
410  for(int i=0; i<regions.size(); i++){
411  observed_max_all_rgns_time += regions[i].get_observed_max_region_time();
412  }
413 
414  // Total predicted time, accounting for observed prediction undershoot for each region
415  double predicted_max_all_rgns_time = 0.0;
416  for(int i=0; i<regions.size(); i++){
417  predicted_max_all_rgns_time += regions[i].get_largest_predicted_time(proposed_partition)*regions[i].get_prediction_undershoot();
418  }
419 
420  double fractional_improvement = 1.0 - predicted_max_all_rgns_time/observed_max_all_rgns_time;
421 
422  if(verbose){
423  printf("\nDetermining whether to adopt the proposed partition:");
424  printf("\n Observed time with current partition was: %1.3e", observed_max_all_rgns_time);
425  printf("\n Predicted time with proposed partition, adjusting for historical undershoot (of Rgn 0: %1.3e): %1.3e", regions[0].get_prediction_undershoot(), predicted_max_all_rgns_time);
426  printf("\n Adopted if Fractional improvement of adjusted prediction (%1.3e) exceeds specified threshold (%1.3e)\n", fractional_improvement, threshold_to_rebalance);
427  }
428 
429  // Recommend the new partition if the predicted time is less than the time observed with the current partition
430  if(fractional_improvement > threshold_to_rebalance){
431  return true;
432  }else{
433  return false;
434  }
435  }
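// Arithmetic example (hypothetical values): observed time 1.00 s with the
// current partition, raw prediction 0.90 s for the proposal, undershoot
// factor 1.05 -> adjusted prediction 0.945 s; fractional improvement
// 1 - 0.945/1.00 = 0.055 exceeds threshold_to_rebalance = 0.02, so the
// proposal is adopted.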
436 
437  void print_new_partition(){
438  // Print new partition
439  for(int i = 0; i<proposed_partition.size()-1; i++){
440  int nnodes_for_proc = proposed_partition(i+1) - proposed_partition(i);
441  printf("process %d : nnodes = %d; range = (%d - %d)\n", i, nnodes_for_proc, proposed_partition(i), proposed_partition(i+1)-1);
442  }
443  }
444 
445  void update_model(const View<int*,CLayout,HostType>& current_partition){
446  for(int i=0; i<regions.size(); i++){
447  regions[i].update_model(current_partition);
448  }
449  }
450 
451  void initialize_model(){
452  for(int i=0; i<regions.size(); i++){
453  regions[i].initialize_model();
454  }
455  }
456 
457  bool model_is_initialized(){
458  bool is_initialized=true;
459  for(int i=0; i<regions.size(); i++){
460  is_initialized = (is_initialized && regions[i].get_model_is_initialized());
461  }
462  return is_initialized;
463  }
464 
465  void propose_new_partition(const Kokkos::View<double*,Kokkos::LayoutRight,HostType>& ptl_count, WeightingAlgorithm weighting_algorithm){
466  auto constraint1 = ptl_count; // Constrain based on particle count (memory)
467  if(weighting_algorithm == WeightingAlgorithm::SingleRegionBalance){
468  if(regions.size()!=1) exit_XGC("Error: Load balancing is currently single-region only\n");
469  one_weight_balance(regions[0].get_estimated_time_per_vertex(), constraint1);
470  }else if(weighting_algorithm == WeightingAlgorithm::ParticleBalance){
471  one_weight_balance(ptl_count, constraint1);
472  }
473 
474  if(verbose){
475  printf("\nNewly PROPOSED partition:\n");
476  print_new_partition();
477  }
478  }
479 
480  void set_new_partition(const Simulation<DeviceType>& sml, const Grid<DeviceType>& grid, const MagneticField<DeviceType>& magnetic_field,
481  const VelocityGrid& vgrid, Plasma& plasma, DomainDecomposition<DeviceType>& pol_decomp, WeightingAlgorithm weighting_algorithm){
482  if(weighting_algorithm==WeightingAlgorithm::Fortran){
483  // In the fortran algorithm the new distribution is calculated and set here
484  // Count number of particles belonging to each node
485  Kokkos::View<double**,Kokkos::LayoutRight,HostType> ptl_count = count_ptl_per_node_elec_main_ion(grid, magnetic_field, plasma, sml.electron_on);
486 
487  TIMER("SET_WEIGHTS",
488  set_weights(ptl_count.data()) );
489  }else{
490 #ifdef USE_MPI
491  // Broadcast proposal to all ranks
492  MPI_Bcast(proposed_partition.data(), proposed_partition.size(), MPI_INT, 0, SML_COMM_WORLD);
493 #endif
494 
495  // Copy proposal to pol_decomp
496  Kokkos::deep_copy(pol_decomp.gvid0_pid_h, proposed_partition);
497 
498  if(is_rank_zero()){
499  printf("\nNEW PARTITION:\n");
500  print_new_partition();
501  }
502  }
503 
504  // Update bounds and device copy of the decomposition array
505  pol_decomp.update_pol_decomp();
506  }
507 
508  void redistribute_load(const Simulation<DeviceType>& sml, const Grid<DeviceType>& grid, const MagneticField<DeviceType>& magnetic_field,
509  const VelocityGrid& vgrid, Plasma& plasma, DomainDecomposition<DeviceType>& pol_decomp,
510  const View<int*,CLayout,HostType>& old_partition){
511  if (sml.f0_grid){
512  // Move f information to correct MPI rank after decomposition update
513  TIMER("F0_REDISTRIBUTE",
514  f0_redistribute(plasma, pol_decomp, vgrid, old_partition) );
515  }
516 
517  // Shift particles to correct MPI rank after decomposition update
518  TIMER("SHIFT_R",
519  shift_all_species(plasma, grid, magnetic_field, pol_decomp, Shift::NoPhase0) );
520  }
521 
522  bool will_rebalance(ReweightOption reweight_option, WeightingAlgorithm weighting_algorithm){
523  if(reweight_option==ReweightOption::Always){
524  // No need to check if model is an improvement
525  return true;
526  }else{
527  if(weighting_algorithm==WeightingAlgorithm::Fortran){
528  // Calculate load imbalance
529  calculate_load_imbalance();
530 
531  // Assess if load imbalance merits a rebalancing
532  int should_rebalance_int = assess_whether_to_rebalance_load();
533  return (should_rebalance_int==1);
534  }else{
535  // Evaluate the proposed partition on rank 0
536  bool should_rebalance = false;
537  if(weighting_algorithm==WeightingAlgorithm::ParticleBalance){
538  // Since ParticleBalance doesn't contain an internal model, we can't evaluate the partition,
539  // so just always approve, for now.
540  // Probably better to have the model updatable by something other than time so it can still
541  // be assessed in this mode
542  return true;
543  }else{
544  if(is_rank_zero()) should_rebalance = recommend_proposed_partition();
545  // Convert to int because MPI_C_BOOL documentation is confusing
546  int should_rebalance_int = (should_rebalance ? 1 : 0);
547 #ifdef USE_MPI
548  MPI_Bcast(&should_rebalance_int, 1, MPI_INT, 0, SML_COMM_WORLD);
549 #endif
550  return (should_rebalance_int==1);
551  }
552  }
553  }
554  }
555 
556  public:
557 
558  LoadBalance(NLReader::NamelistReader& nlr, const DomainDecomposition<DeviceType>& pol_decomp)
559  : proposed_partition(NoInit("proposed_partition"), pol_decomp.gvid0_pid_h.layout())
560  {
561  double max_mem_redist_gb = 10.0;
562  std::string weighting_algorithm_str = "Fortran";
563  if(nlr.namelist_present("load_balance_param")){
564  nlr.use_namelist("load_balance_param");
565  max_mem_redist_gb = nlr.get<double>("max_mem_redist_gb", 10.0);
566  weighting_algorithm_str = nlr.get<std::string>("weighting_algorithm", "Fortran");
567  threshold_to_rebalance = nlr.get<double>("threshold_to_rebalance", 0.02);
568  verbose = nlr.get<bool>("verbose", false);
569  }else{
570  threshold_to_rebalance = 0.02;
571  verbose = false;
572  }
573 
574  // Simple model for memory usage from load balance - just particles
575  double max_n_ptl_on_rank = max_mem_redist_gb*1024*1024*1024/80.0; // ~ 80 B per particle
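// (Illustrative arithmetic: the default 10 GB allows 10*1024^3/80 ~= 1.34e8 particles per rank.)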
576 
577  if(weighting_algorithm_str=="Fortran"){
578  default_weighting_algorithm = WeightingAlgorithm::Fortran;
579  }else if(weighting_algorithm_str=="ParticleBalance"){
580  default_weighting_algorithm = WeightingAlgorithm::ParticleBalance;
581  }else if(weighting_algorithm_str=="SingleRegionBalance"){
582  default_weighting_algorithm = WeightingAlgorithm::SingleRegionBalance;
583  }else{
584  exit_XGC("\nError: weighting_algorithm input not valid.");
585  }
586 
587  // Set up load balance regions
588  // Hard-code one region for now
589  LoadRegion::UpdateMethod update_method = LoadRegion::UpdateMethod::NoHistory;
590  int n_regions = 1;
591  std::string region_name = "ipc2:PUSHE";
592 
593  // Temporary
594  GPTLstart(region_name.c_str());
595  GPTLstop(region_name.c_str());
596 
597  int grid_nnode = pol_decomp.gvid0_pid_h(pol_decomp.gvid0_pid_h.size()-1) - 1;
598 
599  for(int i=0; i<n_regions; i++){
600 #ifdef USE_MPI
601  regions.push_back(LoadRegion(region_name, grid_nnode, pol_decomp.mpi.intpl_comm, pol_decomp.mpi.plane_comm, update_method, verbose));
602 #else
603  regions.push_back(LoadRegion(region_name, grid_nnode, update_method, verbose));
604 #endif
605  }
606  }
607 
608  // Hard-code for now
609  ConstraintOption constraint_option = ConstraintOption::ParticleCount;
610 
611  if(constraint_option == ConstraintOption::ParticleCount){
612 #ifdef USE_MPI
613  // Normalize by n_planes since we are using the sum of planes as the constraint
614  constraint1_max = max_n_ptl_on_rank*pol_decomp.mpi.n_intpl_ranks;
615 #else
616  constraint1_max = max_n_ptl_on_rank;
617 #endif
618  }else{
619  exit_XGC("\nError: ParticleCount is the only available ConstraintOption.\n");
620  }
621  }
622 
623  void rebalance(const Simulation<DeviceType>& sml, const Grid<DeviceType>& grid, const MagneticField<DeviceType>& magnetic_field,
624  const VelocityGrid& vgrid, Plasma& plasma, DomainDecomposition<DeviceType>& pol_decomp, ReweightOption reweight_option,
625  WeightingAlgorithm weighting_algorithm = WeightingAlgorithm::Default){
626  if(weighting_algorithm == WeightingAlgorithm::Default){
627  weighting_algorithm = default_weighting_algorithm;
628  }
629 
630  if(default_weighting_algorithm==WeightingAlgorithm::Fortran){
631  // Override input algorithm; if the default is Fortran, it should always use the fortran load balancer
632  weighting_algorithm = default_weighting_algorithm;
633  }
634 
635  if(is_rank_zero() && verbose){
636  if(weighting_algorithm==WeightingAlgorithm::Fortran) printf("\nLoad balance called with Fortran algorithm\n");
637  else if(weighting_algorithm==WeightingAlgorithm::ParticleBalance) printf("\nLoad balance called with Ptl algorithm\n");
638  else if(weighting_algorithm==WeightingAlgorithm::SingleRegionBalance) printf("\nLoad balance called with SingleRegion algorithm\n");
639  }
640 
641  GPTLstart("REBALANCE");
642  if(weighting_algorithm!=WeightingAlgorithm::Fortran){
643  GPTLstart("PTL_COUNT");
644  plasma.for_all_nonadiabatic_species([&](Species<DeviceType>& species){
645  long long int total_n_ptl = species.get_total_n_ptl();
646  int max_n_ptl = species.get_max_n_ptl();
647 #ifdef USE_MPI
648  double avg_n_ptl = (double)(total_n_ptl)/pol_decomp.mpi.nranks;
649 #else
650  double avg_n_ptl = total_n_ptl;
651 #endif
652  double max_ratio = max_n_ptl/avg_n_ptl;
653  if(is_rank_zero()) printf(" Species %d: (max/avg ptl per rank = %1.2f); total n_ptl = %lld\n", species.idx, max_ratio, total_n_ptl);
654  });
655  GPTLstop("PTL_COUNT");
656 
657  GPTLstart("PTL_COUNT_PER_NODE");
658  auto ptl_count = count_all_ptl_per_node(grid, magnetic_field, plasma);
659  GPTLstop("PTL_COUNT_PER_NODE");
660 
661  if(weighting_algorithm!=WeightingAlgorithm::ParticleBalance){
662  if(model_is_initialized()){
663  // Update the model with the latest timing
664  TIMER("LOAD_BAL_UPDATE",
665  update_model(pol_decomp.gvid0_pid_h) );
666  }else{
667  if(is_rank_zero() && verbose) printf("Initializing timing model, no partition proposal yet.");
668  initialize_model();
669  GPTLstop("REBALANCE");
670  return;
671  }
672  }
673 
674  // With the updated model, propose a new partition
675  GPTLstart("LOAD_BAL_NEW_PART");
676  if (is_rank_zero()) propose_new_partition(ptl_count, weighting_algorithm);
677  GPTLstop("LOAD_BAL_NEW_PART");
678  }
679 
680  GPTLstart("LOAD_BAL_REBAL");
681  if(will_rebalance(reweight_option, weighting_algorithm)){
682  // Save the old partition, it is used to save some communication in the f0 redistribution
683  View<int*,CLayout,HostType> old_partition(NoInit("old_partition"), pol_decomp.gvid0_pid_h.layout());
684  Kokkos::deep_copy(old_partition, pol_decomp.gvid0_pid_h);
685 
686  // Update pol_decomp
687  TIMER("LOAD_BAL_SET_NEW",
688  set_new_partition(sml, grid, magnetic_field, vgrid, plasma, pol_decomp, weighting_algorithm) );
689 
690  // Update particles and f0
691  TIMER("LOAD_BAL_REDIST",
692  redistribute_load(sml, grid, magnetic_field, vgrid, plasma, pol_decomp, old_partition) );
693  }
694  GPTLstop("LOAD_BAL_REBAL");
695  GPTLstop("REBALANCE");
696  }
697 };
698 
699 #endif
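A minimal usage sketch (illustrative, not from the original source: the driver loop, n_steps, and step_physics are hypothetical; the LoadBalance calls follow the signatures above):

// Construct once at startup, then attempt a rebalance every step.
LoadBalance load_balance(nlr, pol_decomp);
for(int step=0; step<n_steps; step++){
  step_physics(plasma, grid, magnetic_field); // hypothetical work being balanced
  load_balance.rebalance(sml, grid, magnetic_field, vgrid, plasma, pol_decomp,
                         LoadBalance::ReweightOption::IfBetter);
}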
View< double *, HostType > estimated_time_per_vertex
Definition: load_balance.hpp:22
static int GPTLstart(const char *name)
Definition: timer_macro.hpp:9
bool is_rank_zero()
Definition: globals.hpp:26
void one_weight_balance(const View< double *, HostType > &weight, const View< double *, CLayout, HostType > constraint1)
Definition: load_balance.hpp:346
MPI_Comm SML_COMM_WORLD
Definition: my_mpi.cpp:4
double threshold_to_rebalance
Definition: load_balance.hpp:258
T get(const string &param, const T default_val, int val_ind=0)
Definition: NamelistReader.hpp:372
double time_accumulated
Definition: load_balance.hpp:38
double predicted_max_region_time
Definition: load_balance.hpp:26
double observed_max_region_time
Definition: load_balance.hpp:27
double get_largest_predicted_time(const View< int *, CLayout, HostType > &partition, const View< double *, HostType > &weight) const
Definition: load_balance.hpp:330
Definition: velocity_grid.hpp:7
Definition: sml.hpp:8
int n_unique_ranks
How many ranks are in a 'period' (in tokamaks, in a plane)
Definition: load_balance.hpp:35
subroutine plasma(grid, itr, p, dene_out, deni_out, Te_out, Ti_out, Vparai_out)
Calculate the plasma density, temperature, and parallel velocity for a point in triangle itr using pl...
Definition: neutral_totalf.F90:1235
Definition: plasma.hpp:144
Definition: NamelistReader.hpp:193
Definition: magnetic_field.hpp:12
void propose_new_partition(const Kokkos::View< double *, Kokkos::LayoutRight, HostType > &ptl_count, WeightingAlgorithm weighting_algorithm)
Definition: load_balance.hpp:465
int idx
Index in all_species.
Definition: species.hpp:75
bool verbose
Definition: load_balance.hpp:259
void for_all_nonadiabatic_species(F func, DevicePtlOpt device_ptl_opt=UseDevicePtl)
Definition: plasma.hpp:163
double prediction_undershoot
Definition: load_balance.hpp:28
double get_even_division(const View< double *, HostType > &input, int n) const
Definition: load_balance.hpp:263
void set_weights(double *ptl_count)
Kokkos::View< double **, Kokkos::LayoutRight, HostType > count_ptl_per_node_elec_main_ion(const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, Plasma &plasma, bool kinetic_electrons)
Definition: count_ptl_per_node.cpp:29
void update_pol_decomp()
Definition: domain_decomposition.tpp:96
void update_model(const View< int *, CLayout, HostType > &current_partition)
Definition: load_balance.hpp:445
long long int get_total_n_ptl()
Definition: species.hpp:680
void f0_redistribute(Plasma &plasma, DomainDecomposition< DeviceType > &pol_decomp, const VelocityGrid &vgrid, const View< int *, CLayout, HostType > &old_partition)
Definition: f0_redistribute.cpp:8
void set_new_partition(const Simulation< DeviceType > &sml, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const VelocityGrid &vgrid, Plasma &plasma, DomainDecomposition< DeviceType > &pol_decomp, WeightingAlgorithm weighting_algorithm)
Definition: load_balance.hpp:480
#define TIMER(N, F)
Definition: timer_macro.hpp:24
void calculate_load_imbalance()
bool will_rebalance(ReweightOption reweight_option, WeightingAlgorithm weighting_algorithm)
Definition: load_balance.hpp:522
void rebalance(const Simulation< DeviceType > &sml, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const VelocityGrid &vgrid, Plasma &plasma, DomainDecomposition< DeviceType > &pol_decomp, ReweightOption reweight_option, WeightingAlgorithm weighting_algorithm=WeightingAlgorithm::Default)
Definition: load_balance.hpp:623
void use_namelist(const string &namelist)
Definition: NamelistReader.hpp:354
ConstraintOption
Definition: load_balance.hpp:241
double get_estimated_time_per_vertex(int i) const
Definition: load_balance.hpp:137
int my_period_rank
Definition: load_balance.hpp:36
void initialize_model()
Definition: load_balance.hpp:174
WeightingAlgorithm default_weighting_algorithm
Definition: load_balance.hpp:252
View< int *, CLayout, HostType > proposed_partition
Which processors get which vertices.
Definition: load_balance.hpp:256
double get_time_since_previous_call()
Definition: load_balance.hpp:56
bool get_model_is_initialized() const
Definition: load_balance.hpp:153
UpdateMethod
Definition: load_balance.hpp:16
std::string region_name
Definition: load_balance.hpp:23
bool verbose
Definition: load_balance.hpp:24
bool f0_grid
Whether to use f0 grid.
Definition: sml.hpp:55
std::vector< LoadRegion > regions
Definition: load_balance.hpp:254
bool namelist_present(const string &namelist)
Definition: NamelistReader.hpp:350
UpdateMethod update_method
Definition: load_balance.hpp:40
bool model_is_initialized
Definition: load_balance.hpp:25
bool electron_on
Use kinetic electrons.
Definition: sml.hpp:54
void redistribute_load(const Simulation< DeviceType > &sml, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const VelocityGrid &vgrid, Plasma &plasma, DomainDecomposition< DeviceType > &pol_decomp, const View< int *, CLayout, HostType > &old_partition)
Definition: load_balance.hpp:508
double get_largest_predicted_time(const View< int *, CLayout, HostType > &proposed_partition) const
Definition: load_balance.hpp:158
void initialize_model()
Definition: load_balance.hpp:451
bool recommend_proposed_partition()
Definition: load_balance.hpp:407
void exit_XGC(std::string msg)
Definition: globals.hpp:36
bool greedily_fill_partition(const View< double *, HostType > &weight, const View< double *, HostType > &constraint1, double target_weight_per_rank)
Definition: load_balance.hpp:273
int n_periods
How many repeating periods there are; in tokamaks this is planes.
Definition: load_balance.hpp:34
Definition: magnetic_field.F90:1
LoadRegion
Definition: load_balance.hpp:14
void print_new_partition()
Definition: load_balance.hpp:437
WeightingAlgorithm
Definition: load_balance.hpp:234
int assess_whether_to_rebalance_load()
LoadBalance(NLReader::NamelistReader &nlr, const DomainDecomposition< DeviceType > &pol_decomp)
Definition: load_balance.hpp:558
LoadRegion(std::string region_name, int n_vertices, UpdateMethod update_method, bool verbose)
Definition: load_balance.hpp:114
View< double *, HostType > get_estimated_time_per_vertex() const
Definition: load_balance.hpp:141
Definition: plasma.hpp:15
double constraint1_max
Definition: load_balance.hpp:261
View< double *, CLayout, HostType > count_all_ptl_per_node(const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, Plasma &plasma)
Definition: count_ptl_per_node.cpp:71
ReweightOption
Definition: load_balance.hpp:245
void shift_all_species(Plasma &plasma, const Grid< DeviceType > &grid, const MagneticField< DeviceType > &magnetic_field, const DomainDecomposition< DeviceType > &pol_decomp, Shift::ShiftPh0 shift_ph0)
Definition: shift.cpp:301
void reset_timer()
Definition: load_balance.hpp:132
Definition: species.hpp:72
bool model_is_initialized()
Definition: load_balance.hpp:457
void update_model(const View< int *, CLayout, HostType > &current_partition)
Definition: load_balance.hpp:181
Definition: shift.hpp:10
double get_observed_max_region_time() const
Definition: load_balance.hpp:149
LoadBalance
Definition: load_balance.hpp:232
Kokkos::ViewAllocateWithoutInitializing NoInit
Definition: space_settings.hpp:68
void update_model_no_history(const View< int *, CLayout, HostType > &current_partition, const View< double *, HostType > &all_periods_timings)
Definition: load_balance.hpp:42
double get_prediction_undershoot() const
Definition: load_balance.hpp:145
int get_max_n_ptl()
Definition: species.hpp:691
static int GPTLstop(const char *name)
Definition: timer_macro.hpp:10
Kokkos::View< int *, Kokkos::LayoutRight, HostType > gvid0_pid_h
Which processors get which vertices (host)
Definition: domain_decomposition.hpp:43