LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
16  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
17  * it may change values between parallel regions. __kmp_max_nth
18  * is the largest value __kmp_nth may take, 1 is the smallest.
19  */
20 
21 #include "kmp.h"
22 #include "kmp_error.h"
23 #include "kmp_i18n.h"
24 #include "kmp_itt.h"
25 #include "kmp_stats.h"
26 #include "kmp_str.h"
27 #if KMP_USE_X87CONTROL
28 #include <float.h>
29 #endif
30 #include "kmp_lock.h"
31 #include "kmp_dispatch.h"
32 #if KMP_USE_HIER_SCHED
33 #include "kmp_dispatch_hier.h"
34 #endif
35 
36 #if OMPT_SUPPORT
37 #include "ompt-specific.h"
38 #endif
39 
40 /* ------------------------------------------------------------------------ */
41 /* ------------------------------------------------------------------------ */
42 
43 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
44  kmp_info_t *th;
45 
46  KMP_DEBUG_ASSERT(gtid_ref);
47 
48  if (__kmp_env_consistency_check) {
49  th = __kmp_threads[*gtid_ref];
50  if (th->th.th_root->r.r_active &&
51  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
52 #if KMP_USE_DYNAMIC_LOCK
53  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
54 #else
55  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
56 #endif
57  }
58  }
59 }
60 
61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
62  kmp_info_t *th;
63 
64  if (__kmp_env_consistency_check) {
65  th = __kmp_threads[*gtid_ref];
66  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
67  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
68  }
69  }
70 }
71 
72 // Initialize a dispatch_private_info_template<T> buffer for a particular
73 // type of schedule,chunk. The loop description is found in lb (lower bound),
74 // ub (upper bound), and st (stride). nproc is the number of threads relevant
75 // to the scheduling (often the number of threads in a team, but not always if
76 // hierarchical scheduling is used). tid is the id of the thread calling
77 // the function within the group of nproc threads. It will have a value
78 // between 0 and nproc - 1. This is often just the thread id within a team, but
79 // is not necessarily the case when using hierarchical scheduling.
80 // loc is the source file location of the corresponding loop
81 // gtid is the global thread id
82 template <typename T>
83 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
84  dispatch_private_info_template<T> *pr,
85  enum sched_type schedule, T lb, T ub,
86  typename traits_t<T>::signed_t st,
87 #if USE_ITT_BUILD
88  kmp_uint64 *cur_chunk,
89 #endif
90  typename traits_t<T>::signed_t chunk,
91  T nproc, T tid) {
92  typedef typename traits_t<T>::unsigned_t UT;
93  typedef typename traits_t<T>::floating_t DBL;
94 
95  int active;
96  T tc;
97  kmp_info_t *th;
98  kmp_team_t *team;
99 
100 #ifdef KMP_DEBUG
101  typedef typename traits_t<T>::signed_t ST;
102  {
103  char *buff;
104  // create format specifiers before the debug output
105  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
106  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
107  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
108  traits_t<T>::spec, traits_t<T>::spec,
109  traits_t<ST>::spec, traits_t<ST>::spec,
110  traits_t<T>::spec, traits_t<T>::spec);
111  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
112  __kmp_str_free(&buff);
113  }
114 #endif
115  /* setup data */
116  th = __kmp_threads[gtid];
117  team = th->th.th_team;
118  active = !team->t.t_serialized;
119 
120 #if USE_ITT_BUILD
121  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
122  __kmp_forkjoin_frames_mode == 3 &&
123  KMP_MASTER_GTID(gtid) &&
124 #if OMP_40_ENABLED
125  th->th.th_teams_microtask == NULL &&
126 #endif
127  team->t.t_active_level == 1;
128 #endif
129 #if (KMP_STATIC_STEAL_ENABLED)
130  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
131  // AC: we now have only one implementation of stealing, so use it
132  schedule = kmp_sch_static_steal;
133  else
134 #endif
135  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
136 
137  /* Pick up the nomerge/ordered bits from the scheduling type */
138  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
139  pr->flags.nomerge = TRUE;
140  schedule =
141  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
142  } else {
143  pr->flags.nomerge = FALSE;
144  }
145  pr->type_size = traits_t<T>::type_size; // remember the size of variables
146  if (kmp_ord_lower & schedule) {
147  pr->flags.ordered = TRUE;
148  schedule =
149  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
150  } else {
151  pr->flags.ordered = FALSE;
152  }
153 
154  if (schedule == kmp_sch_static) {
155  schedule = __kmp_static;
156  } else {
157  if (schedule == kmp_sch_runtime) {
158  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
159  // not specified)
160  schedule = team->t.t_sched.r_sched_type;
161  // Detail the schedule if needed (global controls are differentiated
162  // appropriately)
163  if (schedule == kmp_sch_guided_chunked) {
164  schedule = __kmp_guided;
165  } else if (schedule == kmp_sch_static) {
166  schedule = __kmp_static;
167  }
168  // Use the chunk size specified by OMP_SCHEDULE (or default if not
169  // specified)
170  chunk = team->t.t_sched.chunk;
171 #if USE_ITT_BUILD
172  if (cur_chunk)
173  *cur_chunk = chunk;
174 #endif
175 #ifdef KMP_DEBUG
176  {
177  char *buff;
178  // create format specifiers before the debug output
179  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
180  "schedule:%%d chunk:%%%s\n",
181  traits_t<ST>::spec);
182  KD_TRACE(10, (buff, gtid, schedule, chunk));
183  __kmp_str_free(&buff);
184  }
185 #endif
186  } else {
187  if (schedule == kmp_sch_guided_chunked) {
188  schedule = __kmp_guided;
189  }
190  if (chunk <= 0) {
191  chunk = KMP_DEFAULT_CHUNK;
192  }
193  }
194 
195  if (schedule == kmp_sch_auto) {
196  // mapping and differentiation: in the __kmp_do_serial_initialize()
197  schedule = __kmp_auto;
198 #ifdef KMP_DEBUG
199  {
200  char *buff;
201  // create format specifiers before the debug output
202  buff = __kmp_str_format(
203  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
204  "schedule:%%d chunk:%%%s\n",
205  traits_t<ST>::spec);
206  KD_TRACE(10, (buff, gtid, schedule, chunk));
207  __kmp_str_free(&buff);
208  }
209 #endif
210  }
211 
212  /* guided analytical not safe for too many threads */
213  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
214  schedule = kmp_sch_guided_iterative_chunked;
215  KMP_WARNING(DispatchManyThreads);
216  }
217 #if OMP_45_ENABLED
218  if (schedule == kmp_sch_runtime_simd) {
219  // compiler provides simd_width in the chunk parameter
220  schedule = team->t.t_sched.r_sched_type;
221  // Detail the schedule if needed (global controls are differentiated
222  // appropriately)
223  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
224  schedule == __kmp_static) {
225  schedule = kmp_sch_static_balanced_chunked;
226  } else {
227  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
228  schedule = kmp_sch_guided_simd;
229  }
230  chunk = team->t.t_sched.chunk * chunk;
231  }
232 #if USE_ITT_BUILD
233  if (cur_chunk)
234  *cur_chunk = chunk;
235 #endif
236 #ifdef KMP_DEBUG
237  {
238  char *buff;
239  // create format specifiers before the debug output
240  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
241  " chunk:%%%s\n",
242  traits_t<ST>::spec);
243  KD_TRACE(10, (buff, gtid, schedule, chunk));
244  __kmp_str_free(&buff);
245  }
246 #endif
247  }
248 #endif // OMP_45_ENABLED
249  pr->u.p.parm1 = chunk;
250  }
251  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
252  "unknown scheduling type");
253 
254  pr->u.p.count = 0;
255 
256  if (__kmp_env_consistency_check) {
257  if (st == 0) {
258  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
259  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
260  }
261  }
262  // compute trip count
263  if (st == 1) { // most common case
264  if (ub >= lb) {
265  tc = ub - lb + 1;
266  } else { // ub < lb
267  tc = 0; // zero-trip
268  }
269  } else if (st < 0) {
270  if (lb >= ub) {
271  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
272  // where the division needs to be unsigned regardless of the result type
273  tc = (UT)(lb - ub) / (-st) + 1;
274  } else { // lb < ub
275  tc = 0; // zero-trip
276  }
277  } else { // st > 0
278  if (ub >= lb) {
279  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
280  // where the division needs to be unsigned regardless of the result type
281  tc = (UT)(ub - lb) / st + 1;
282  } else { // ub < lb
283  tc = 0; // zero-trip
284  }
285  }
286 
287  pr->u.p.lb = lb;
288  pr->u.p.ub = ub;
289  pr->u.p.st = st;
290  pr->u.p.tc = tc;
291 
292 #if KMP_OS_WINDOWS
293  pr->u.p.last_upper = ub + st;
294 #endif /* KMP_OS_WINDOWS */
295 
296  /* NOTE: only the active parallel region(s) has active ordered sections */
297 
298  if (active) {
299  if (pr->flags.ordered) {
300  pr->ordered_bumped = 0;
301  pr->u.p.ordered_lower = 1;
302  pr->u.p.ordered_upper = 0;
303  }
304  }
305 
306  switch (schedule) {
307 #if (KMP_STATIC_STEAL_ENABLED)
308  case kmp_sch_static_steal: {
309  T ntc, init;
310 
311  KD_TRACE(100,
312  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
313  gtid));
314 
315  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
316  if (nproc > 1 && ntc >= nproc) {
317  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
318  T id = tid;
319  T small_chunk, extras;
320 
321  small_chunk = ntc / nproc;
322  extras = ntc % nproc;
323 
324  init = id * small_chunk + (id < extras ? id : extras);
325  pr->u.p.count = init;
326  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
327 
328  pr->u.p.parm2 = lb;
329  // pr->pfields.parm3 = 0; // it's not used in static_steal
330  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
331  pr->u.p.st = st;
332  if (traits_t<T>::type_size > 4) {
333  // AC: TODO: check if 16-byte CAS available and use it to
334  // improve performance (probably wait for explicit request
335  // before spending time on this).
336  // For now use dynamically allocated per-thread lock,
337  // free memory in __kmp_dispatch_next when status==0.
338  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
339  th->th.th_dispatch->th_steal_lock =
340  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
341  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
342  }
343  break;
344  } else {
345  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
346  "kmp_sch_static_balanced\n",
347  gtid));
348  schedule = kmp_sch_static_balanced;
349  /* too few iterations: fall-through to kmp_sch_static_balanced */
350  } // if
351  /* FALL-THROUGH to static balanced */
352  } // case
353 #endif
354  case kmp_sch_static_balanced: {
355  T init, limit;
356 
357  KD_TRACE(
358  100,
359  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
360  gtid));
361 
362  if (nproc > 1) {
363  T id = tid;
364 
365  if (tc < nproc) {
366  if (id < tc) {
367  init = id;
368  limit = id;
369  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
370  } else {
371  pr->u.p.count = 1; /* means no more chunks to execute */
372  pr->u.p.parm1 = FALSE;
373  break;
374  }
375  } else {
376  T small_chunk = tc / nproc;
377  T extras = tc % nproc;
378  init = id * small_chunk + (id < extras ? id : extras);
379  limit = init + small_chunk - (id < extras ? 0 : 1);
380  pr->u.p.parm1 = (id == nproc - 1);
381  }
382  } else {
383  if (tc > 0) {
384  init = 0;
385  limit = tc - 1;
386  pr->u.p.parm1 = TRUE;
387  } else {
388  // zero trip count
389  pr->u.p.count = 1; /* means no more chunks to execute */
390  pr->u.p.parm1 = FALSE;
391  break;
392  }
393  }
394 #if USE_ITT_BUILD
395  // Calculate chunk for metadata report
396  if (itt_need_metadata_reporting)
397  if (cur_chunk)
398  *cur_chunk = limit - init + 1;
399 #endif
400  if (st == 1) {
401  pr->u.p.lb = lb + init;
402  pr->u.p.ub = lb + limit;
403  } else {
404  // calculated upper bound, "ub" is user-defined upper bound
405  T ub_tmp = lb + limit * st;
406  pr->u.p.lb = lb + init * st;
407  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
408  // it exactly
409  if (st > 0) {
410  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
411  } else {
412  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
413  }
414  }
415  if (pr->flags.ordered) {
416  pr->u.p.ordered_lower = init;
417  pr->u.p.ordered_upper = limit;
418  }
419  break;
420  } // case
421 #if OMP_45_ENABLED
422  case kmp_sch_static_balanced_chunked: {
423  // similar to balanced, but chunk adjusted to multiple of simd width
424  T nth = nproc;
425  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
426  " -> falling-through to static_greedy\n",
427  gtid));
428  schedule = kmp_sch_static_greedy;
429  if (nth > 1)
430  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
431  else
432  pr->u.p.parm1 = tc;
433  break;
434  } // case
435  case kmp_sch_guided_simd:
436 #endif // OMP_45_ENABLED
437  case kmp_sch_guided_iterative_chunked: {
438  KD_TRACE(
439  100,
440  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
441  " case\n",
442  gtid));
443 
444  if (nproc > 1) {
445  if ((2L * chunk + 1) * nproc >= tc) {
446  /* chunk size too large, switch to dynamic */
447  schedule = kmp_sch_dynamic_chunked;
448  } else {
449  // when remaining iters become less than parm2 - switch to dynamic
450  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
451  *(double *)&pr->u.p.parm3 =
452  guided_flt_param / nproc; // may occupy parm3 and parm4
453  }
454  } else {
455  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
456  "kmp_sch_static_greedy\n",
457  gtid));
458  schedule = kmp_sch_static_greedy;
459  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
460  KD_TRACE(
461  100,
462  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
463  gtid));
464  pr->u.p.parm1 = tc;
465  } // if
466  } // case
467  break;
468  case kmp_sch_guided_analytical_chunked: {
469  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
470  "kmp_sch_guided_analytical_chunked case\n",
471  gtid));
472 
473  if (nproc > 1) {
474  if ((2L * chunk + 1) * nproc >= tc) {
475  /* chunk size too large, switch to dynamic */
476  schedule = kmp_sch_dynamic_chunked;
477  } else {
478  /* commonly used term: (2 nproc - 1)/(2 nproc) */
479  DBL x;
480 
481 #if KMP_USE_X87CONTROL
482  /* Linux* OS already has 64-bit computation by default for long double,
483  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
484  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
485  instead of the default 53-bit. Even though long double doesn't work
486  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
487  expected to impact the correctness of the algorithm, but this has not
488  been mathematically proven. */
489  // save original FPCW and set precision to 64-bit, as
490  // Windows* OS on IA-32 architecture defaults to 53-bit
491  unsigned int oldFpcw = _control87(0, 0);
492  _control87(_PC_64, _MCW_PC); // 0,0x30000
493 #endif
494  /* value used for comparison in solver for cross-over point */
495  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
496 
497  /* crossover point--chunk indexes equal to or greater than
498  this point switch to dynamic-style scheduling */
499  UT cross;
500 
501  /* commonly used term: (2 nproc - 1)/(2 nproc) */
502  x = (long double)1.0 - (long double)0.5 / nproc;
503 
504 #ifdef KMP_DEBUG
505  { // test natural alignment
506  struct _test_a {
507  char a;
508  union {
509  char b;
510  DBL d;
511  };
512  } t;
513  ptrdiff_t natural_alignment =
514  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
515  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
516  // long)natural_alignment );
517  KMP_DEBUG_ASSERT(
518  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
519  }
520 #endif // KMP_DEBUG
521 
522  /* save the term in thread private dispatch structure */
523  *(DBL *)&pr->u.p.parm3 = x;
524 
525  /* solve for the crossover point to the nearest integer i for which C_i
526  <= chunk */
527  {
528  UT left, right, mid;
529  long double p;
530 
531  /* estimate initial upper and lower bound */
532 
533  /* doesn't matter what value right is as long as it is positive, but
534  it affects performance of the solver */
535  right = 229;
536  p = __kmp_pow<UT>(x, right);
537  if (p > target) {
538  do {
539  p *= p;
540  right <<= 1;
541  } while (p > target && right < (1 << 27));
542  /* lower bound is previous (failed) estimate of upper bound */
543  left = right >> 1;
544  } else {
545  left = 0;
546  }
547 
548  /* bisection root-finding method */
549  while (left + 1 < right) {
550  mid = (left + right) / 2;
551  if (__kmp_pow<UT>(x, mid) > target) {
552  left = mid;
553  } else {
554  right = mid;
555  }
556  } // while
557  cross = right;
558  }
559  /* assert sanity of computed crossover point */
560  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
561  __kmp_pow<UT>(x, cross) <= target);
562 
563  /* save the crossover point in thread private dispatch structure */
564  pr->u.p.parm2 = cross;
565 
566 // C75803
567 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
568 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
569 #else
570 #define GUIDED_ANALYTICAL_WORKAROUND (x)
571 #endif
572  /* dynamic-style scheduling offset */
573  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
574  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
575  cross * chunk;
576 #if KMP_USE_X87CONTROL
577  // restore FPCW
578  _control87(oldFpcw, _MCW_PC);
579 #endif
580  } // if
581  } else {
582  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
583  "kmp_sch_static_greedy\n",
584  gtid));
585  schedule = kmp_sch_static_greedy;
586  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
587  pr->u.p.parm1 = tc;
588  } // if
589  } // case
590  break;
591  case kmp_sch_static_greedy:
592  KD_TRACE(
593  100,
594  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
595  gtid));
596  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
597  break;
598  case kmp_sch_static_chunked:
599  case kmp_sch_dynamic_chunked:
600  if (pr->u.p.parm1 <= 0) {
601  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
602  }
603  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
604  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
605  gtid));
606  break;
607  case kmp_sch_trapezoidal: {
608  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
609 
610  T parm1, parm2, parm3, parm4;
611  KD_TRACE(100,
612  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
613  gtid));
614 
615  parm1 = chunk;
616 
617  /* F : size of the first cycle */
618  parm2 = (tc / (2 * nproc));
619 
620  if (parm2 < 1) {
621  parm2 = 1;
622  }
623 
624  /* L : size of the last cycle. Make sure the last cycle is not larger
625  than the first cycle. */
626  if (parm1 < 1) {
627  parm1 = 1;
628  } else if (parm1 > parm2) {
629  parm1 = parm2;
630  }
631 
632  /* N : number of cycles */
633  parm3 = (parm2 + parm1);
634  parm3 = (2 * tc + parm3 - 1) / parm3;
635 
636  if (parm3 < 2) {
637  parm3 = 2;
638  }
639 
640  /* sigma : decreasing incr of the trapezoid */
641  parm4 = (parm3 - 1);
642  parm4 = (parm2 - parm1) / parm4;
643 
644  // pointless check, because parm4 >= 0 always
645  // if ( parm4 < 0 ) {
646  // parm4 = 0;
647  //}
648 
649  pr->u.p.parm1 = parm1;
650  pr->u.p.parm2 = parm2;
651  pr->u.p.parm3 = parm3;
652  pr->u.p.parm4 = parm4;
653  } // case
654  break;
655 
656  default: {
657  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
658  KMP_HNT(GetNewerLibrary), // Hint
659  __kmp_msg_null // Variadic argument list terminator
660  );
661  } break;
662  } // switch
663  pr->schedule = schedule;
664 }
665 
666 #if KMP_USE_HIER_SCHED
667 template <typename T>
668 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
669  typename traits_t<T>::signed_t st);
670 template <>
671 inline void
672 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
673  kmp_int32 ub, kmp_int32 st) {
674  __kmp_dispatch_init_hierarchy<kmp_int32>(
675  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
676  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
677 }
678 template <>
679 inline void
680 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
681  kmp_uint32 ub, kmp_int32 st) {
682  __kmp_dispatch_init_hierarchy<kmp_uint32>(
683  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
684  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
685 }
686 template <>
687 inline void
688 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
689  kmp_int64 ub, kmp_int64 st) {
690  __kmp_dispatch_init_hierarchy<kmp_int64>(
691  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
692  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
693 }
694 template <>
695 inline void
696 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
697  kmp_uint64 ub, kmp_int64 st) {
698  __kmp_dispatch_init_hierarchy<kmp_uint64>(
699  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
700  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
701 }
702 
703 // free all the hierarchy scheduling memory associated with the team
704 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
705  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
706  for (int i = 0; i < num_disp_buff; ++i) {
707  // type does not matter here so use kmp_int32
708  auto sh =
709  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
710  &team->t.t_disp_buffer[i]);
711  if (sh->hier) {
712  sh->hier->deallocate();
713  __kmp_free(sh->hier);
714  }
715  }
716 }
717 #endif
718 
719 // UT - unsigned flavor of T, ST - signed flavor of T,
720 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
721 template <typename T>
722 static void
723 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
724  T ub, typename traits_t<T>::signed_t st,
725  typename traits_t<T>::signed_t chunk, int push_ws) {
726  typedef typename traits_t<T>::unsigned_t UT;
727 
728  int active;
729  kmp_info_t *th;
730  kmp_team_t *team;
731  kmp_uint32 my_buffer_index;
732  dispatch_private_info_template<T> *pr;
733  dispatch_shared_info_template<T> volatile *sh;
734 
735  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
736  sizeof(dispatch_private_info));
737  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
738  sizeof(dispatch_shared_info));
739 
740  if (!TCR_4(__kmp_init_parallel))
741  __kmp_parallel_initialize();
742 
743 #if INCLUDE_SSC_MARKS
744  SSC_MARK_DISPATCH_INIT();
745 #endif
746 #ifdef KMP_DEBUG
747  typedef typename traits_t<T>::signed_t ST;
748  {
749  char *buff;
750  // create format specifiers before the debug output
751  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
752  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
753  traits_t<ST>::spec, traits_t<T>::spec,
754  traits_t<T>::spec, traits_t<ST>::spec);
755  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
756  __kmp_str_free(&buff);
757  }
758 #endif
759  /* setup data */
760  th = __kmp_threads[gtid];
761  team = th->th.th_team;
762  active = !team->t.t_serialized;
763  th->th.th_ident = loc;
764 
765  // Any half-decent optimizer will remove this test when the blocks are empty
766  // since the macros expand to nothing
767  // when statistics are disabled.
768  if (schedule == __kmp_static) {
769  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
770  } else {
771  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
772  }
773 
774 #if KMP_USE_HIER_SCHED
775  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
776  // Hierarchical scheduling does not work with ordered, so if ordered is
777  // detected, then revert back to threaded scheduling.
778  bool ordered;
779  enum sched_type my_sched = schedule;
780  my_buffer_index = th->th.th_dispatch->th_disp_index;
781  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
782  &th->th.th_dispatch
783  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
784  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
785  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
786  my_sched =
787  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
788  ordered = (kmp_ord_lower & my_sched);
789  if (pr->flags.use_hier) {
790  if (ordered) {
791  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
792  "Disabling hierarchical scheduling.\n",
793  gtid));
794  pr->flags.use_hier = FALSE;
795  }
796  }
797  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
798  // Don't use hierarchical for ordered parallel loops and don't
799  // use the runtime hierarchy if one was specified in the program
800  if (!ordered && !pr->flags.use_hier)
801  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
802  }
803 #endif // KMP_USE_HIER_SCHED
804 
805 #if USE_ITT_BUILD
806  kmp_uint64 cur_chunk = chunk;
807  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
808  __kmp_forkjoin_frames_mode == 3 &&
809  KMP_MASTER_GTID(gtid) &&
810 #if OMP_40_ENABLED
811  th->th.th_teams_microtask == NULL &&
812 #endif
813  team->t.t_active_level == 1;
814 #endif
815  if (!active) {
816  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
817  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
818  } else {
819  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
820  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
821 
822  my_buffer_index = th->th.th_dispatch->th_disp_index++;
823 
824  /* What happens when number of threads changes, need to resize buffer? */
825  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
826  &th->th.th_dispatch
827  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
828  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
829  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
830  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
831  my_buffer_index));
832  }
833 
834  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
835 #if USE_ITT_BUILD
836  &cur_chunk,
837 #endif
838  chunk, (T)th->th.th_team_nproc,
839  (T)th->th.th_info.ds.ds_tid);
840  if (active) {
841  if (pr->flags.ordered == 0) {
842  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
843  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
844  } else {
845  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
846  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
847  }
848  }
849 
850  if (active) {
851  /* The name of this buffer should be my_buffer_index when it's free to use
852  * it */
853 
854  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
855  "sh->buffer_index:%d\n",
856  gtid, my_buffer_index, sh->buffer_index));
857  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
858  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
859  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
860  // my_buffer_index are *always* 32-bit integers.
861  KMP_MB(); /* is this necessary? */
862  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
863  "sh->buffer_index:%d\n",
864  gtid, my_buffer_index, sh->buffer_index));
865 
866  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
867  th->th.th_dispatch->th_dispatch_sh_current =
868  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
869 #if USE_ITT_BUILD
870  if (pr->flags.ordered) {
871  __kmp_itt_ordered_init(gtid);
872  }
873  // Report loop metadata
874  if (itt_need_metadata_reporting) {
875  // Only report metadata by master of active team at level 1
876  kmp_uint64 schedtype = 0;
877  switch (schedule) {
878  case kmp_sch_static_chunked:
879  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
880  break;
881  case kmp_sch_static_greedy:
882  cur_chunk = pr->u.p.parm1;
883  break;
884  case kmp_sch_dynamic_chunked:
885  schedtype = 1;
886  break;
887  case kmp_sch_guided_iterative_chunked:
888  case kmp_sch_guided_analytical_chunked:
889 #if OMP_45_ENABLED
890  case kmp_sch_guided_simd:
891 #endif
892  schedtype = 2;
893  break;
894  default:
895  // Should we put this case under "static"?
896  // case kmp_sch_static_steal:
897  schedtype = 3;
898  break;
899  }
900  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
901  }
902 #if KMP_USE_HIER_SCHED
903  if (pr->flags.use_hier) {
904  pr->u.p.count = 0;
905  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
906  }
907 #endif // KMP_USER_HIER_SCHED
908 #endif /* USE_ITT_BUILD */
909  }
910 
911 #ifdef KMP_DEBUG
912  {
913  char *buff;
914  // create format specifiers before the debug output
915  buff = __kmp_str_format(
916  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
917  "lb:%%%s ub:%%%s"
918  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
919  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
920  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
921  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
922  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
923  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
924  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
925  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
926  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
927  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
928  __kmp_str_free(&buff);
929  }
930 #endif
931 #if (KMP_STATIC_STEAL_ENABLED)
932  // It cannot be guaranteed that after execution of a loop with some other
933  // schedule kind all the parm3 variables will contain the same value. Even if
934  // all parm3 will be the same, it still exists a bad case like using 0 and 1
935  // rather than program life-time increment. So the dedicated variable is
936  // required. The 'static_steal_counter' is used.
937  if (schedule == kmp_sch_static_steal) {
938  // Other threads will inspect this variable when searching for a victim.
939  // This is a flag showing that other threads may steal from this thread
940  // since then.
941  volatile T *p = &pr->u.p.static_steal_counter;
942  *p = *p + 1;
943  }
944 #endif // ( KMP_STATIC_STEAL_ENABLED )
945 
946 #if OMPT_SUPPORT && OMPT_OPTIONAL
947  if (ompt_enabled.ompt_callback_work) {
948  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
949  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
950  ompt_callbacks.ompt_callback(ompt_callback_work)(
951  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
952  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
953  }
954 #endif
955  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
956 }
957 
958 /* For ordered loops, either __kmp_dispatch_finish() should be called after
959  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
960  * every chunk of iterations. If the ordered section(s) were not executed
961  * for this iteration (or every iteration in this chunk), we need to set the
962  * ordered iteration counters so that the next thread can proceed. */
963 template <typename UT>
964 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
965  typedef typename traits_t<UT>::signed_t ST;
966  kmp_info_t *th = __kmp_threads[gtid];
967 
968  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
969  if (!th->th.th_team->t.t_serialized) {
970 
971  dispatch_private_info_template<UT> *pr =
972  reinterpret_cast<dispatch_private_info_template<UT> *>(
973  th->th.th_dispatch->th_dispatch_pr_current);
974  dispatch_shared_info_template<UT> volatile *sh =
975  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
976  th->th.th_dispatch->th_dispatch_sh_current);
977  KMP_DEBUG_ASSERT(pr);
978  KMP_DEBUG_ASSERT(sh);
979  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
980  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
981 
982  if (pr->ordered_bumped) {
983  KD_TRACE(
984  1000,
985  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
986  gtid));
987  pr->ordered_bumped = 0;
988  } else {
989  UT lower = pr->u.p.ordered_lower;
990 
991 #ifdef KMP_DEBUG
992  {
993  char *buff;
994  // create format specifiers before the debug output
995  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
996  "ordered_iteration:%%%s lower:%%%s\n",
997  traits_t<UT>::spec, traits_t<UT>::spec);
998  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
999  __kmp_str_free(&buff);
1000  }
1001 #endif
1002 
1003  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1004  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1005  KMP_MB(); /* is this necessary? */
1006 #ifdef KMP_DEBUG
1007  {
1008  char *buff;
1009  // create format specifiers before the debug output
1010  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1011  "ordered_iteration:%%%s lower:%%%s\n",
1012  traits_t<UT>::spec, traits_t<UT>::spec);
1013  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1014  __kmp_str_free(&buff);
1015  }
1016 #endif
1017 
1018  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1019  } // if
1020  } // if
1021  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1022 }
1023 
1024 #ifdef KMP_GOMP_COMPAT
1025 
1026 template <typename UT>
1027 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1028  typedef typename traits_t<UT>::signed_t ST;
1029  kmp_info_t *th = __kmp_threads[gtid];
1030 
1031  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1032  if (!th->th.th_team->t.t_serialized) {
1033  // int cid;
1034  dispatch_private_info_template<UT> *pr =
1035  reinterpret_cast<dispatch_private_info_template<UT> *>(
1036  th->th.th_dispatch->th_dispatch_pr_current);
1037  dispatch_shared_info_template<UT> volatile *sh =
1038  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1039  th->th.th_dispatch->th_dispatch_sh_current);
1040  KMP_DEBUG_ASSERT(pr);
1041  KMP_DEBUG_ASSERT(sh);
1042  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1043  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1044 
1045  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1046  UT lower = pr->u.p.ordered_lower;
1047  UT upper = pr->u.p.ordered_upper;
1048  UT inc = upper - lower + 1;
1049 
1050  if (pr->ordered_bumped == inc) {
1051  KD_TRACE(
1052  1000,
1053  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1054  gtid));
1055  pr->ordered_bumped = 0;
1056  } else {
1057  inc -= pr->ordered_bumped;
1058 
1059 #ifdef KMP_DEBUG
1060  {
1061  char *buff;
1062  // create format specifiers before the debug output
1063  buff = __kmp_str_format(
1064  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1065  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1066  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1067  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1068  __kmp_str_free(&buff);
1069  }
1070 #endif
1071 
1072  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1073  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1074 
1075  KMP_MB(); /* is this necessary? */
1076  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1077  "ordered_bumped to zero\n",
1078  gtid));
1079  pr->ordered_bumped = 0;
1081 #ifdef KMP_DEBUG
1082  {
1083  char *buff;
1084  // create format specifiers before the debug output
1085  buff = __kmp_str_format(
1086  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1087  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1088  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1089  traits_t<UT>::spec);
1090  KD_TRACE(1000,
1091  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1092  __kmp_str_free(&buff);
1093  }
1094 #endif
1095 
1096  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1097  }
1098  // }
1099  }
1100  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1101 }
1102 
1103 #endif /* KMP_GOMP_COMPAT */
1104 
1105 template <typename T>
1106 int __kmp_dispatch_next_algorithm(int gtid,
1107  dispatch_private_info_template<T> *pr,
1108  dispatch_shared_info_template<T> volatile *sh,
1109  kmp_int32 *p_last, T *p_lb, T *p_ub,
1110  typename traits_t<T>::signed_t *p_st, T nproc,
1111  T tid) {
1112  typedef typename traits_t<T>::unsigned_t UT;
1113  typedef typename traits_t<T>::signed_t ST;
1114  typedef typename traits_t<T>::floating_t DBL;
1115  int status = 0;
1116  kmp_int32 last = 0;
1117  T start;
1118  ST incr;
1119  UT limit, trip, init;
1120  kmp_info_t *th = __kmp_threads[gtid];
1121  kmp_team_t *team = th->th.th_team;
1122 
1123  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1124  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1125  KMP_DEBUG_ASSERT(pr);
1126  KMP_DEBUG_ASSERT(sh);
1127  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1128 #ifdef KMP_DEBUG
1129  {
1130  char *buff;
1131  // create format specifiers before the debug output
1132  buff =
1133  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1134  "sh:%%p nproc:%%%s tid:%%%s\n",
1135  traits_t<T>::spec, traits_t<T>::spec);
1136  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1137  __kmp_str_free(&buff);
1138  }
1139 #endif
1140 
1141  // zero trip count
1142  if (pr->u.p.tc == 0) {
1143  KD_TRACE(10,
1144  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1145  "zero status:%d\n",
1146  gtid, status));
1147  return 0;
1148  }
1149 
1150  switch (pr->schedule) {
1151 #if (KMP_STATIC_STEAL_ENABLED)
1152  case kmp_sch_static_steal: {
1153  T chunk = pr->u.p.parm1;
1154 
1155  KD_TRACE(100,
1156  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1157  gtid));
1158 
1159  trip = pr->u.p.tc - 1;
1160 
1161  if (traits_t<T>::type_size > 4) {
1162  // use lock for 8-byte and CAS for 4-byte induction
1163  // variable. TODO (optional): check and use 16-byte CAS
1164  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1165  KMP_DEBUG_ASSERT(lck != NULL);
1166  if (pr->u.p.count < (UT)pr->u.p.ub) {
1167  __kmp_acquire_lock(lck, gtid);
1168  // try to get own chunk of iterations
1169  init = (pr->u.p.count)++;
1170  status = (init < (UT)pr->u.p.ub);
1171  __kmp_release_lock(lck, gtid);
1172  } else {
1173  status = 0; // no own chunks
1174  }
1175  if (!status) { // try to steal
1176  kmp_info_t **other_threads = team->t.t_threads;
1177  int while_limit = nproc; // nproc attempts to find a victim
1178  int while_index = 0;
1179  // TODO: algorithm of searching for a victim
1180  // should be cleaned up and measured
1181  while ((!status) && (while_limit != ++while_index)) {
1182  T remaining;
1183  T victimIdx = pr->u.p.parm4;
1184  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1185  dispatch_private_info_template<T> *victim =
1186  reinterpret_cast<dispatch_private_info_template<T> *>(
1187  other_threads[victimIdx]
1188  ->th.th_dispatch->th_dispatch_pr_current);
1189  while ((victim == NULL || victim == pr ||
1190  (*(volatile T *)&victim->u.p.static_steal_counter !=
1191  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1192  oldVictimIdx != victimIdx) {
1193  victimIdx = (victimIdx + 1) % nproc;
1194  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1195  other_threads[victimIdx]
1196  ->th.th_dispatch->th_dispatch_pr_current);
1197  }
1198  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1199  *(volatile T *)&pr->u.p.static_steal_counter)) {
1200  continue; // try once more (nproc attempts in total)
1201  // no victim is ready yet to participate in stealing
1202  // because all victims are still in kmp_init_dispatch
1203  }
1204  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1205  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1206  continue; // not enough chunks to steal, goto next victim
1207  }
1208 
1209  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1210  KMP_ASSERT(lck != NULL);
1211  __kmp_acquire_lock(lck, gtid);
1212  limit = victim->u.p.ub; // keep initial ub
1213  if (victim->u.p.count >= limit ||
1214  (remaining = limit - victim->u.p.count) < 2) {
1215  __kmp_release_lock(lck, gtid);
1216  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1217  continue; // not enough chunks to steal
1218  }
1219  // stealing succeded, reduce victim's ub by 1/4 of undone chunks or
1220  // by 1
1221  if (remaining > 3) {
1222  // steal 1/4 of remaining
1223  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1224  init = (victim->u.p.ub -= (remaining >> 2));
1225  } else {
1226  // steal 1 chunk of 2 or 3 remaining
1227  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1228  init = (victim->u.p.ub -= 1);
1229  }
1230  __kmp_release_lock(lck, gtid);
1231 
1232  KMP_DEBUG_ASSERT(init + 1 <= limit);
1233  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1234  status = 1;
1235  while_index = 0;
1236  // now update own count and ub with stolen range but init chunk
1237  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1238  pr->u.p.count = init + 1;
1239  pr->u.p.ub = limit;
1240  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1241  } // while (search for victim)
1242  } // if (try to find victim and steal)
1243  } else {
1244  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1245  typedef union {
1246  struct {
1247  UT count;
1248  T ub;
1249  } p;
1250  kmp_int64 b;
1251  } union_i4;
1252  // All operations on 'count' or 'ub' must be combined atomically
1253  // together.
1254  {
1255  union_i4 vold, vnew;
1256  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1257  vnew = vold;
1258  vnew.p.count++;
1259  while (!KMP_COMPARE_AND_STORE_ACQ64(
1260  (volatile kmp_int64 *)&pr->u.p.count,
1261  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1262  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1263  KMP_CPU_PAUSE();
1264  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1265  vnew = vold;
1266  vnew.p.count++;
1267  }
1268  vnew = vold;
1269  init = vnew.p.count;
1270  status = (init < (UT)vnew.p.ub);
1271  }
1272 
1273  if (!status) {
1274  kmp_info_t **other_threads = team->t.t_threads;
1275  int while_limit = nproc; // nproc attempts to find a victim
1276  int while_index = 0;
1277 
1278  // TODO: algorithm of searching for a victim
1279  // should be cleaned up and measured
1280  while ((!status) && (while_limit != ++while_index)) {
1281  union_i4 vold, vnew;
1282  kmp_int32 remaining;
1283  T victimIdx = pr->u.p.parm4;
1284  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1285  dispatch_private_info_template<T> *victim =
1286  reinterpret_cast<dispatch_private_info_template<T> *>(
1287  other_threads[victimIdx]
1288  ->th.th_dispatch->th_dispatch_pr_current);
1289  while ((victim == NULL || victim == pr ||
1290  (*(volatile T *)&victim->u.p.static_steal_counter !=
1291  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1292  oldVictimIdx != victimIdx) {
1293  victimIdx = (victimIdx + 1) % nproc;
1294  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1295  other_threads[victimIdx]
1296  ->th.th_dispatch->th_dispatch_pr_current);
1297  }
1298  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1299  *(volatile T *)&pr->u.p.static_steal_counter)) {
1300  continue; // try once more (nproc attempts in total)
1301  // no victim is ready yet to participate in stealing
1302  // because all victims are still in kmp_init_dispatch
1303  }
1304  pr->u.p.parm4 = victimIdx; // new victim found
1305  while (1) { // CAS loop if victim has enough chunks to steal
1306  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1307  vnew = vold;
1308 
1309  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1310  if (vnew.p.count >= (UT)vnew.p.ub ||
1311  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1312  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1313  break; // not enough chunks to steal, goto next victim
1314  }
1315  if (remaining > 3) {
1316  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1317  } else {
1318  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1319  }
1320  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1321  // TODO: Should this be acquire or release?
1322  if (KMP_COMPARE_AND_STORE_ACQ64(
1323  (volatile kmp_int64 *)&victim->u.p.count,
1324  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1325  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1326  // stealing succedded
1327  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1328  vold.p.ub - vnew.p.ub);
1329  status = 1;
1330  while_index = 0;
1331  // now update own count and ub
1332  init = vnew.p.ub;
1333  vold.p.count = init + 1;
1334 #if KMP_ARCH_X86
1335  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1336 #else
1337  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1338 #endif
1339  break;
1340  } // if (check CAS result)
1341  KMP_CPU_PAUSE(); // CAS failed, repeate attempt
1342  } // while (try to steal from particular victim)
1343  } // while (search for victim)
1344  } // if (try to find victim and steal)
1345  } // if (4-byte induction variable)
1346  if (!status) {
1347  *p_lb = 0;
1348  *p_ub = 0;
1349  if (p_st != NULL)
1350  *p_st = 0;
1351  } else {
1352  start = pr->u.p.parm2;
1353  init *= chunk;
1354  limit = chunk + init - 1;
1355  incr = pr->u.p.st;
1356  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1357 
1358  KMP_DEBUG_ASSERT(init <= trip);
1359  if ((last = (limit >= trip)) != 0)
1360  limit = trip;
1361  if (p_st != NULL)
1362  *p_st = incr;
1363 
1364  if (incr == 1) {
1365  *p_lb = start + init;
1366  *p_ub = start + limit;
1367  } else {
1368  *p_lb = start + init * incr;
1369  *p_ub = start + limit * incr;
1370  }
1371 
1372  if (pr->flags.ordered) {
1373  pr->u.p.ordered_lower = init;
1374  pr->u.p.ordered_upper = limit;
1375  } // if
1376  } // if
1377  break;
1378  } // case
1379 #endif // ( KMP_STATIC_STEAL_ENABLED )
1380  case kmp_sch_static_balanced: {
1381  KD_TRACE(
1382  10,
1383  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1384  gtid));
1385  /* check if thread has any iteration to do */
1386  if ((status = !pr->u.p.count) != 0) {
1387  pr->u.p.count = 1;
1388  *p_lb = pr->u.p.lb;
1389  *p_ub = pr->u.p.ub;
1390  last = pr->u.p.parm1;
1391  if (p_st != NULL)
1392  *p_st = pr->u.p.st;
1393  } else { /* no iterations to do */
1394  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1395  }
1396  } // case
1397  break;
1398  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1399  merged here */
1400  case kmp_sch_static_chunked: {
1401  T parm1;
1402 
1403  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1404  "kmp_sch_static_[affinity|chunked] case\n",
1405  gtid));
1406  parm1 = pr->u.p.parm1;
1407 
1408  trip = pr->u.p.tc - 1;
1409  init = parm1 * (pr->u.p.count + tid);
1410 
1411  if ((status = (init <= trip)) != 0) {
1412  start = pr->u.p.lb;
1413  incr = pr->u.p.st;
1414  limit = parm1 + init - 1;
1415 
1416  if ((last = (limit >= trip)) != 0)
1417  limit = trip;
1418 
1419  if (p_st != NULL)
1420  *p_st = incr;
1421 
1422  pr->u.p.count += nproc;
1423 
1424  if (incr == 1) {
1425  *p_lb = start + init;
1426  *p_ub = start + limit;
1427  } else {
1428  *p_lb = start + init * incr;
1429  *p_ub = start + limit * incr;
1430  }
1431 
1432  if (pr->flags.ordered) {
1433  pr->u.p.ordered_lower = init;
1434  pr->u.p.ordered_upper = limit;
1435  } // if
1436  } // if
1437  } // case
1438  break;
1439 
1440  case kmp_sch_dynamic_chunked: {
1441  T chunk = pr->u.p.parm1;
1442 
1443  KD_TRACE(
1444  100,
1445  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1446  gtid));
1447 
1448  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1449  trip = pr->u.p.tc - 1;
1450 
1451  if ((status = (init <= trip)) == 0) {
1452  *p_lb = 0;
1453  *p_ub = 0;
1454  if (p_st != NULL)
1455  *p_st = 0;
1456  } else {
1457  start = pr->u.p.lb;
1458  limit = chunk + init - 1;
1459  incr = pr->u.p.st;
1460 
1461  if ((last = (limit >= trip)) != 0)
1462  limit = trip;
1463 
1464  if (p_st != NULL)
1465  *p_st = incr;
1466 
1467  if (incr == 1) {
1468  *p_lb = start + init;
1469  *p_ub = start + limit;
1470  } else {
1471  *p_lb = start + init * incr;
1472  *p_ub = start + limit * incr;
1473  }
1474 
1475  if (pr->flags.ordered) {
1476  pr->u.p.ordered_lower = init;
1477  pr->u.p.ordered_upper = limit;
1478  } // if
1479  } // if
1480  } // case
1481  break;
1482 
1483  case kmp_sch_guided_iterative_chunked: {
1484  T chunkspec = pr->u.p.parm1;
1485  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1486  "iterative case\n",
1487  gtid));
1488  trip = pr->u.p.tc;
1489  // Start atomic part of calculations
1490  while (1) {
1491  ST remaining; // signed, because can be < 0
1492  init = sh->u.s.iteration; // shared value
1493  remaining = trip - init;
1494  if (remaining <= 0) { // AC: need to compare with 0 first
1495  // nothing to do, don't try atomic op
1496  status = 0;
1497  break;
1498  }
1499  if ((T)remaining <
1500  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1501  // use dynamic-style shcedule
1502  // atomically inrement iterations, get old value
1503  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1504  (ST)chunkspec);
1505  remaining = trip - init;
1506  if (remaining <= 0) {
1507  status = 0; // all iterations got by other threads
1508  } else {
1509  // got some iterations to work on
1510  status = 1;
1511  if ((T)remaining > chunkspec) {
1512  limit = init + chunkspec - 1;
1513  } else {
1514  last = 1; // the last chunk
1515  limit = init + remaining - 1;
1516  } // if
1517  } // if
1518  break;
1519  } // if
1520  limit = init +
1521  (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1522  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1523  (ST)init, (ST)limit)) {
1524  // CAS was successful, chunk obtained
1525  status = 1;
1526  --limit;
1527  break;
1528  } // if
1529  } // while
1530  if (status != 0) {
1531  start = pr->u.p.lb;
1532  incr = pr->u.p.st;
1533  if (p_st != NULL)
1534  *p_st = incr;
1535  *p_lb = start + init * incr;
1536  *p_ub = start + limit * incr;
1537  if (pr->flags.ordered) {
1538  pr->u.p.ordered_lower = init;
1539  pr->u.p.ordered_upper = limit;
1540  } // if
1541  } else {
1542  *p_lb = 0;
1543  *p_ub = 0;
1544  if (p_st != NULL)
1545  *p_st = 0;
1546  } // if
1547  } // case
1548  break;
1549 
1550 #if OMP_45_ENABLED
1551  case kmp_sch_guided_simd: {
1552  // same as iterative but curr-chunk adjusted to be multiple of given
1553  // chunk
1554  T chunk = pr->u.p.parm1;
1555  KD_TRACE(100,
1556  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1557  gtid));
1558  trip = pr->u.p.tc;
1559  // Start atomic part of calculations
1560  while (1) {
1561  ST remaining; // signed, because can be < 0
1562  init = sh->u.s.iteration; // shared value
1563  remaining = trip - init;
1564  if (remaining <= 0) { // AC: need to compare with 0 first
1565  status = 0; // nothing to do, don't try atomic op
1566  break;
1567  }
1568  KMP_DEBUG_ASSERT(init % chunk == 0);
1569  // compare with K*nproc*(chunk+1), K=2 by default
1570  if ((T)remaining < pr->u.p.parm2) {
1571  // use dynamic-style shcedule
1572  // atomically inrement iterations, get old value
1573  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1574  (ST)chunk);
1575  remaining = trip - init;
1576  if (remaining <= 0) {
1577  status = 0; // all iterations got by other threads
1578  } else {
1579  // got some iterations to work on
1580  status = 1;
1581  if ((T)remaining > chunk) {
1582  limit = init + chunk - 1;
1583  } else {
1584  last = 1; // the last chunk
1585  limit = init + remaining - 1;
1586  } // if
1587  } // if
1588  break;
1589  } // if
1590  // divide by K*nproc
1591  UT span = remaining * (*(double *)&pr->u.p.parm3);
1592  UT rem = span % chunk;
1593  if (rem) // adjust so that span%chunk == 0
1594  span += chunk - rem;
1595  limit = init + span;
1596  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1597  (ST)init, (ST)limit)) {
1598  // CAS was successful, chunk obtained
1599  status = 1;
1600  --limit;
1601  break;
1602  } // if
1603  } // while
1604  if (status != 0) {
1605  start = pr->u.p.lb;
1606  incr = pr->u.p.st;
1607  if (p_st != NULL)
1608  *p_st = incr;
1609  *p_lb = start + init * incr;
1610  *p_ub = start + limit * incr;
1611  if (pr->flags.ordered) {
1612  pr->u.p.ordered_lower = init;
1613  pr->u.p.ordered_upper = limit;
1614  } // if
1615  } else {
1616  *p_lb = 0;
1617  *p_ub = 0;
1618  if (p_st != NULL)
1619  *p_st = 0;
1620  } // if
1621  } // case
1622  break;
1623 #endif // OMP_45_ENABLED
1624 
1625  case kmp_sch_guided_analytical_chunked: {
1626  T chunkspec = pr->u.p.parm1;
1627  UT chunkIdx;
1628 #if KMP_USE_X87CONTROL
1629  /* for storing original FPCW value for Windows* OS on
1630  IA-32 architecture 8-byte version */
1631  unsigned int oldFpcw;
1632  unsigned int fpcwSet = 0;
1633 #endif
1634  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1635  "kmp_sch_guided_analytical_chunked case\n",
1636  gtid));
1637 
1638  trip = pr->u.p.tc;
1639 
1640  KMP_DEBUG_ASSERT(nproc > 1);
1641  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1642 
1643  while (1) { /* this while loop is a safeguard against unexpected zero
1644  chunk sizes */
1645  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1646  if (chunkIdx >= (UT)pr->u.p.parm2) {
1647  --trip;
1648  /* use dynamic-style scheduling */
1649  init = chunkIdx * chunkspec + pr->u.p.count;
1650  /* need to verify init > 0 in case of overflow in the above
1651  * calculation */
1652  if ((status = (init > 0 && init <= trip)) != 0) {
1653  limit = init + chunkspec - 1;
1654 
1655  if ((last = (limit >= trip)) != 0)
1656  limit = trip;
1657  }
1658  break;
1659  } else {
1660 /* use exponential-style scheduling */
1661 /* The following check is to workaround the lack of long double precision on
1662  Windows* OS.
1663  This check works around the possible effect that init != 0 for chunkIdx == 0.
1664  */
1665 #if KMP_USE_X87CONTROL
1666  /* If we haven't already done so, save original
1667  FPCW and set precision to 64-bit, as Windows* OS
1668  on IA-32 architecture defaults to 53-bit */
1669  if (!fpcwSet) {
1670  oldFpcw = _control87(0, 0);
1671  _control87(_PC_64, _MCW_PC);
1672  fpcwSet = 0x30000;
1673  }
1674 #endif
1675  if (chunkIdx) {
1676  init = __kmp_dispatch_guided_remaining<T>(
1677  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1678  KMP_DEBUG_ASSERT(init);
1679  init = trip - init;
1680  } else
1681  init = 0;
1682  limit = trip - __kmp_dispatch_guided_remaining<T>(
1683  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1684  KMP_ASSERT(init <= limit);
1685  if (init < limit) {
1686  KMP_DEBUG_ASSERT(limit <= trip);
1687  --limit;
1688  status = 1;
1689  break;
1690  } // if
1691  } // if
1692  } // while (1)
1693 #if KMP_USE_X87CONTROL
1694  /* restore FPCW if necessary
1695  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1696  */
1697  if (fpcwSet && (oldFpcw & fpcwSet))
1698  _control87(oldFpcw, _MCW_PC);
1699 #endif
1700  if (status != 0) {
1701  start = pr->u.p.lb;
1702  incr = pr->u.p.st;
1703  if (p_st != NULL)
1704  *p_st = incr;
1705  *p_lb = start + init * incr;
1706  *p_ub = start + limit * incr;
1707  if (pr->flags.ordered) {
1708  pr->u.p.ordered_lower = init;
1709  pr->u.p.ordered_upper = limit;
1710  }
1711  } else {
1712  *p_lb = 0;
1713  *p_ub = 0;
1714  if (p_st != NULL)
1715  *p_st = 0;
1716  }
1717  } // case
1718  break;
1719 
1720  case kmp_sch_trapezoidal: {
1721  UT index;
1722  T parm2 = pr->u.p.parm2;
1723  T parm3 = pr->u.p.parm3;
1724  T parm4 = pr->u.p.parm4;
1725  KD_TRACE(100,
1726  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1727  gtid));
1728 
1729  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1730 
1731  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1732  trip = pr->u.p.tc - 1;
1733 
1734  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1735  *p_lb = 0;
1736  *p_ub = 0;
1737  if (p_st != NULL)
1738  *p_st = 0;
1739  } else {
1740  start = pr->u.p.lb;
1741  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1742  incr = pr->u.p.st;
1743 
1744  if ((last = (limit >= trip)) != 0)
1745  limit = trip;
1746 
1747  if (p_st != NULL)
1748  *p_st = incr;
1749 
1750  if (incr == 1) {
1751  *p_lb = start + init;
1752  *p_ub = start + limit;
1753  } else {
1754  *p_lb = start + init * incr;
1755  *p_ub = start + limit * incr;
1756  }
1757 
1758  if (pr->flags.ordered) {
1759  pr->u.p.ordered_lower = init;
1760  pr->u.p.ordered_upper = limit;
1761  } // if
1762  } // if
1763  } // case
1764  break;
1765  default: {
1766  status = 0; // to avoid complaints on uninitialized variable use
1767  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1768  KMP_HNT(GetNewerLibrary), // Hint
1769  __kmp_msg_null // Variadic argument list terminator
1770  );
1771  } break;
1772  } // switch
1773  if (p_last)
1774  *p_last = last;
1775 #ifdef KMP_DEBUG
1776  if (pr->flags.ordered) {
1777  char *buff;
1778  // create format specifiers before the debug output
1779  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1780  "ordered_lower:%%%s ordered_upper:%%%s\n",
1781  traits_t<UT>::spec, traits_t<UT>::spec);
1782  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1783  __kmp_str_free(&buff);
1784  }
1785  {
1786  char *buff;
1787  // create format specifiers before the debug output
1788  buff = __kmp_str_format(
1789  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1790  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1791  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1792  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1793  __kmp_str_free(&buff);
1794  }
1795 #endif
1796  return status;
1797 }
1798 
1799 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1800  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1801  is not called. */
1802 #if OMPT_SUPPORT && OMPT_OPTIONAL
1803 #define OMPT_LOOP_END \
1804  if (status == 0) { \
1805  if (ompt_enabled.ompt_callback_work) { \
1806  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1807  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1808  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1809  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1810  &(task_info->task_data), 0, codeptr); \
1811  } \
1812  }
1813 // TODO: implement count
1814 #else
1815 #define OMPT_LOOP_END // no-op
1816 #endif
1817 
1818 #if KMP_STATS_ENABLED
1819 #define KMP_STATS_LOOP_END \
1820  { \
1821  kmp_int64 u, l, t, i; \
1822  l = (kmp_int64)(*p_lb); \
1823  u = (kmp_int64)(*p_ub); \
1824  i = (kmp_int64)(pr->u.p.st); \
1825  if (status == 0) { \
1826  t = 0; \
1827  KMP_POP_PARTITIONED_TIMER(); \
1828  } else if (i == 1) { \
1829  if (u >= l) \
1830  t = u - l + 1; \
1831  else \
1832  t = 0; \
1833  } else if (i < 0) { \
1834  if (l >= u) \
1835  t = (l - u) / (-i) + 1; \
1836  else \
1837  t = 0; \
1838  } else { \
1839  if (u >= l) \
1840  t = (u - l) / i + 1; \
1841  else \
1842  t = 0; \
1843  } \
1844  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1845  }
1846 #else
1847 #define KMP_STATS_LOOP_END /* Nothing */
1848 #endif
1849 
1850 template <typename T>
1851 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1852  T *p_lb, T *p_ub,
1853  typename traits_t<T>::signed_t *p_st
1854 #if OMPT_SUPPORT && OMPT_OPTIONAL
1855  ,
1856  void *codeptr
1857 #endif
1858  ) {
1859 
1860  typedef typename traits_t<T>::unsigned_t UT;
1861  typedef typename traits_t<T>::signed_t ST;
1862  // This is potentially slightly misleading, schedule(runtime) will appear here
1863  // even if the actual runtme schedule is static. (Which points out a
1864  // disadavantage of schedule(runtime): even when static scheduling is used it
1865  // costs more than a compile time choice to use static scheduling would.)
1866  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1867 
1868  int status;
1869  dispatch_private_info_template<T> *pr;
1870  kmp_info_t *th = __kmp_threads[gtid];
1871  kmp_team_t *team = th->th.th_team;
1872 
1873  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1874  KD_TRACE(
1875  1000,
1876  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1877  gtid, p_lb, p_ub, p_st, p_last));
1878 
1879  if (team->t.t_serialized) {
1880  /* NOTE: serialize this dispatch becase we are not at the active level */
1881  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1882  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1883  KMP_DEBUG_ASSERT(pr);
1884 
1885  if ((status = (pr->u.p.tc != 0)) == 0) {
1886  *p_lb = 0;
1887  *p_ub = 0;
1888  // if ( p_last != NULL )
1889  // *p_last = 0;
1890  if (p_st != NULL)
1891  *p_st = 0;
1892  if (__kmp_env_consistency_check) {
1893  if (pr->pushed_ws != ct_none) {
1894  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1895  }
1896  }
1897  } else if (pr->flags.nomerge) {
1898  kmp_int32 last;
1899  T start;
1900  UT limit, trip, init;
1901  ST incr;
1902  T chunk = pr->u.p.parm1;
1903 
1904  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1905  gtid));
1906 
1907  init = chunk * pr->u.p.count++;
1908  trip = pr->u.p.tc - 1;
1909 
1910  if ((status = (init <= trip)) == 0) {
1911  *p_lb = 0;
1912  *p_ub = 0;
1913  // if ( p_last != NULL )
1914  // *p_last = 0;
1915  if (p_st != NULL)
1916  *p_st = 0;
1917  if (__kmp_env_consistency_check) {
1918  if (pr->pushed_ws != ct_none) {
1919  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1920  }
1921  }
1922  } else {
1923  start = pr->u.p.lb;
1924  limit = chunk + init - 1;
1925  incr = pr->u.p.st;
1926 
1927  if ((last = (limit >= trip)) != 0) {
1928  limit = trip;
1929 #if KMP_OS_WINDOWS
1930  pr->u.p.last_upper = pr->u.p.ub;
1931 #endif /* KMP_OS_WINDOWS */
1932  }
1933  if (p_last != NULL)
1934  *p_last = last;
1935  if (p_st != NULL)
1936  *p_st = incr;
1937  if (incr == 1) {
1938  *p_lb = start + init;
1939  *p_ub = start + limit;
1940  } else {
1941  *p_lb = start + init * incr;
1942  *p_ub = start + limit * incr;
1943  }
1944 
1945  if (pr->flags.ordered) {
1946  pr->u.p.ordered_lower = init;
1947  pr->u.p.ordered_upper = limit;
1948 #ifdef KMP_DEBUG
1949  {
1950  char *buff;
1951  // create format specifiers before the debug output
1952  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1953  "ordered_lower:%%%s ordered_upper:%%%s\n",
1954  traits_t<UT>::spec, traits_t<UT>::spec);
1955  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1956  pr->u.p.ordered_upper));
1957  __kmp_str_free(&buff);
1958  }
1959 #endif
1960  } // if
1961  } // if
1962  } else {
1963  pr->u.p.tc = 0;
1964  *p_lb = pr->u.p.lb;
1965  *p_ub = pr->u.p.ub;
1966 #if KMP_OS_WINDOWS
1967  pr->u.p.last_upper = *p_ub;
1968 #endif /* KMP_OS_WINDOWS */
1969  if (p_last != NULL)
1970  *p_last = TRUE;
1971  if (p_st != NULL)
1972  *p_st = pr->u.p.st;
1973  } // if
1974 #ifdef KMP_DEBUG
1975  {
1976  char *buff;
1977  // create format specifiers before the debug output
1978  buff = __kmp_str_format(
1979  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1980  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1981  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1982  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1983  __kmp_str_free(&buff);
1984  }
1985 #endif
1986 #if INCLUDE_SSC_MARKS
1987  SSC_MARK_DISPATCH_NEXT();
1988 #endif
1989  OMPT_LOOP_END;
1990  KMP_STATS_LOOP_END;
1991  return status;
1992  } else {
1993  kmp_int32 last = 0;
1994  dispatch_shared_info_template<T> volatile *sh;
1995 
1996  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1997  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1998 
1999  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2000  th->th.th_dispatch->th_dispatch_pr_current);
2001  KMP_DEBUG_ASSERT(pr);
2002  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2003  th->th.th_dispatch->th_dispatch_sh_current);
2004  KMP_DEBUG_ASSERT(sh);
2005 
2006 #if KMP_USE_HIER_SCHED
2007  if (pr->flags.use_hier)
2008  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2009  else
2010 #endif // KMP_USE_HIER_SCHED
2011  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2012  p_st, th->th.th_team_nproc,
2013  th->th.th_info.ds.ds_tid);
2014  // status == 0: no more iterations to execute
2015  if (status == 0) {
2016  UT num_done;
2017 
2018  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2019 #ifdef KMP_DEBUG
2020  {
2021  char *buff;
2022  // create format specifiers before the debug output
2023  buff = __kmp_str_format(
2024  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2025  traits_t<UT>::spec);
2026  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2027  __kmp_str_free(&buff);
2028  }
2029 #endif
2030 
2031 #if KMP_USE_HIER_SCHED
2032  pr->flags.use_hier = FALSE;
2033 #endif
2034  if ((ST)num_done == th->th.th_team_nproc - 1) {
2035 #if (KMP_STATIC_STEAL_ENABLED)
2036  if (pr->schedule == kmp_sch_static_steal &&
2037  traits_t<T>::type_size > 4) {
2038  int i;
2039  kmp_info_t **other_threads = team->t.t_threads;
2040  // loop complete, safe to destroy locks used for stealing
2041  for (i = 0; i < th->th.th_team_nproc; ++i) {
2042  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2043  KMP_ASSERT(lck != NULL);
2044  __kmp_destroy_lock(lck);
2045  __kmp_free(lck);
2046  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2047  }
2048  }
2049 #endif
2050  /* NOTE: release this buffer to be reused */
2051 
2052  KMP_MB(); /* Flush all pending memory write invalidates. */
2053 
2054  sh->u.s.num_done = 0;
2055  sh->u.s.iteration = 0;
2056 
2057  /* TODO replace with general release procedure? */
2058  if (pr->flags.ordered) {
2059  sh->u.s.ordered_iteration = 0;
2060  }
2061 
2062  KMP_MB(); /* Flush all pending memory write invalidates. */
2063 
2064  sh->buffer_index += __kmp_dispatch_num_buffers;
2065  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2066  gtid, sh->buffer_index));
2067 
2068  KMP_MB(); /* Flush all pending memory write invalidates. */
2069 
2070  } // if
2071  if (__kmp_env_consistency_check) {
2072  if (pr->pushed_ws != ct_none) {
2073  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2074  }
2075  }
2076 
2077  th->th.th_dispatch->th_deo_fcn = NULL;
2078  th->th.th_dispatch->th_dxo_fcn = NULL;
2079  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2080  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2081  } // if (status == 0)
2082 #if KMP_OS_WINDOWS
2083  else if (last) {
2084  pr->u.p.last_upper = pr->u.p.ub;
2085  }
2086 #endif /* KMP_OS_WINDOWS */
2087  if (p_last != NULL && status != 0)
2088  *p_last = last;
2089  } // if
2090 
2091 #ifdef KMP_DEBUG
2092  {
2093  char *buff;
2094  // create format specifiers before the debug output
2095  buff = __kmp_str_format(
2096  "__kmp_dispatch_next: T#%%d normal case: "
2097  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2098  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2099  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2100  (p_last ? *p_last : 0), status));
2101  __kmp_str_free(&buff);
2102  }
2103 #endif
2104 #if INCLUDE_SSC_MARKS
2105  SSC_MARK_DISPATCH_NEXT();
2106 #endif
2107  OMPT_LOOP_END;
2108  KMP_STATS_LOOP_END;
2109  return status;
2110 }
2111 
2112 template <typename T>
2113 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2114  kmp_int32 *plastiter, T *plower, T *pupper,
2115  typename traits_t<T>::signed_t incr) {
2116  typedef typename traits_t<T>::unsigned_t UT;
2117  kmp_uint32 team_id;
2118  kmp_uint32 nteams;
2119  UT trip_count;
2120  kmp_team_t *team;
2121  kmp_info_t *th;
2122 
2123  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2124  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2125 #ifdef KMP_DEBUG
2126  typedef typename traits_t<T>::signed_t ST;
2127  {
2128  char *buff;
2129  // create format specifiers before the debug output
2130  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2131  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2132  traits_t<T>::spec, traits_t<T>::spec,
2133  traits_t<ST>::spec, traits_t<T>::spec);
2134  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2135  __kmp_str_free(&buff);
2136  }
2137 #endif
2138 
2139  if (__kmp_env_consistency_check) {
2140  if (incr == 0) {
2141  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2142  loc);
2143  }
2144  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2145  // The loop is illegal.
2146  // Some zero-trip loops maintained by compiler, e.g.:
2147  // for(i=10;i<0;++i) // lower >= upper - run-time check
2148  // for(i=0;i>10;--i) // lower <= upper - run-time check
2149  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2150  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2151  // Compiler does not check the following illegal loops:
2152  // for(i=0;i<10;i+=incr) // where incr<0
2153  // for(i=10;i>0;i-=incr) // where incr<0
2154  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2155  }
2156  }
2157  th = __kmp_threads[gtid];
2158  team = th->th.th_team;
2159 #if OMP_40_ENABLED
2160  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2161  nteams = th->th.th_teams_size.nteams;
2162 #endif
2163  team_id = team->t.t_master_tid;
2164  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2165 
2166  // compute global trip count
2167  if (incr == 1) {
2168  trip_count = *pupper - *plower + 1;
2169  } else if (incr == -1) {
2170  trip_count = *plower - *pupper + 1;
2171  } else if (incr > 0) {
2172  // upper-lower can exceed the limit of signed type
2173  trip_count = (UT)(*pupper - *plower) / incr + 1;
2174  } else {
2175  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2176  }
2177 
2178  if (trip_count <= nteams) {
2179  KMP_DEBUG_ASSERT(
2180  __kmp_static == kmp_sch_static_greedy ||
2181  __kmp_static ==
2182  kmp_sch_static_balanced); // Unknown static scheduling type.
2183  // only some teams get single iteration, others get nothing
2184  if (team_id < trip_count) {
2185  *pupper = *plower = *plower + team_id * incr;
2186  } else {
2187  *plower = *pupper + incr; // zero-trip loop
2188  }
2189  if (plastiter != NULL)
2190  *plastiter = (team_id == trip_count - 1);
2191  } else {
2192  if (__kmp_static == kmp_sch_static_balanced) {
2193  UT chunk = trip_count / nteams;
2194  UT extras = trip_count % nteams;
2195  *plower +=
2196  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2197  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2198  if (plastiter != NULL)
2199  *plastiter = (team_id == nteams - 1);
2200  } else {
2201  T chunk_inc_count =
2202  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2203  T upper = *pupper;
2204  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2205  // Unknown static scheduling type.
2206  *plower += team_id * chunk_inc_count;
2207  *pupper = *plower + chunk_inc_count - incr;
2208  // Check/correct bounds if needed
2209  if (incr > 0) {
2210  if (*pupper < *plower)
2211  *pupper = traits_t<T>::max_value;
2212  if (plastiter != NULL)
2213  *plastiter = *plower <= upper && *pupper > upper - incr;
2214  if (*pupper > upper)
2215  *pupper = upper; // tracker C73258
2216  } else {
2217  if (*pupper > *plower)
2218  *pupper = traits_t<T>::min_value;
2219  if (plastiter != NULL)
2220  *plastiter = *plower >= upper && *pupper < upper - incr;
2221  if (*pupper < upper)
2222  *pupper = upper; // tracker C73258
2223  }
2224  }
2225  }
2226 }
2227 
2228 //-----------------------------------------------------------------------------
2229 // Dispatch routines
2230 // Transfer call to template< type T >
2231 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2232 // T lb, T ub, ST st, ST chunk )
2233 extern "C" {
2234 
2251 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2252  enum sched_type schedule, kmp_int32 lb,
2253  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2254  KMP_DEBUG_ASSERT(__kmp_init_serial);
2255 #if OMPT_SUPPORT && OMPT_OPTIONAL
2256  OMPT_STORE_RETURN_ADDRESS(gtid);
2257 #endif
2258  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2259 }
2263 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2264  enum sched_type schedule, kmp_uint32 lb,
2265  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2266  KMP_DEBUG_ASSERT(__kmp_init_serial);
2267 #if OMPT_SUPPORT && OMPT_OPTIONAL
2268  OMPT_STORE_RETURN_ADDRESS(gtid);
2269 #endif
2270  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2271 }
2272 
2276 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2277  enum sched_type schedule, kmp_int64 lb,
2278  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2279  KMP_DEBUG_ASSERT(__kmp_init_serial);
2280 #if OMPT_SUPPORT && OMPT_OPTIONAL
2281  OMPT_STORE_RETURN_ADDRESS(gtid);
2282 #endif
2283  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2284 }
2285 
2289 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2290  enum sched_type schedule, kmp_uint64 lb,
2291  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2292  KMP_DEBUG_ASSERT(__kmp_init_serial);
2293 #if OMPT_SUPPORT && OMPT_OPTIONAL
2294  OMPT_STORE_RETURN_ADDRESS(gtid);
2295 #endif
2296  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2297 }
2298 
2308 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2309  enum sched_type schedule, kmp_int32 *p_last,
2310  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2311  kmp_int32 chunk) {
2312  KMP_DEBUG_ASSERT(__kmp_init_serial);
2313 #if OMPT_SUPPORT && OMPT_OPTIONAL
2314  OMPT_STORE_RETURN_ADDRESS(gtid);
2315 #endif
2316  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2317  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2318 }
2319 
2320 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2321  enum sched_type schedule, kmp_int32 *p_last,
2322  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2323  kmp_int32 chunk) {
2324  KMP_DEBUG_ASSERT(__kmp_init_serial);
2325 #if OMPT_SUPPORT && OMPT_OPTIONAL
2326  OMPT_STORE_RETURN_ADDRESS(gtid);
2327 #endif
2328  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2329  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2330 }
2331 
2332 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2333  enum sched_type schedule, kmp_int32 *p_last,
2334  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2335  kmp_int64 chunk) {
2336  KMP_DEBUG_ASSERT(__kmp_init_serial);
2337 #if OMPT_SUPPORT && OMPT_OPTIONAL
2338  OMPT_STORE_RETURN_ADDRESS(gtid);
2339 #endif
2340  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2341  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2342 }
2343 
2344 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2345  enum sched_type schedule, kmp_int32 *p_last,
2346  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2347  kmp_int64 chunk) {
2348  KMP_DEBUG_ASSERT(__kmp_init_serial);
2349 #if OMPT_SUPPORT && OMPT_OPTIONAL
2350  OMPT_STORE_RETURN_ADDRESS(gtid);
2351 #endif
2352  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2353  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2354 }
2355 
2369 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2370  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2371 #if OMPT_SUPPORT && OMPT_OPTIONAL
2372  OMPT_STORE_RETURN_ADDRESS(gtid);
2373 #endif
2374  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376  ,
2377  OMPT_LOAD_RETURN_ADDRESS(gtid)
2378 #endif
2379  );
2380 }
2381 
2385 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2386  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2387  kmp_int32 *p_st) {
2388 #if OMPT_SUPPORT && OMPT_OPTIONAL
2389  OMPT_STORE_RETURN_ADDRESS(gtid);
2390 #endif
2391  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
2393  ,
2394  OMPT_LOAD_RETURN_ADDRESS(gtid)
2395 #endif
2396  );
2397 }
2398 
2402 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2403  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405  OMPT_STORE_RETURN_ADDRESS(gtid);
2406 #endif
2407  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2408 #if OMPT_SUPPORT && OMPT_OPTIONAL
2409  ,
2410  OMPT_LOAD_RETURN_ADDRESS(gtid)
2411 #endif
2412  );
2413 }
2414 
2418 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2419  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2420  kmp_int64 *p_st) {
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422  OMPT_STORE_RETURN_ADDRESS(gtid);
2423 #endif
2424  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426  ,
2427  OMPT_LOAD_RETURN_ADDRESS(gtid)
2428 #endif
2429  );
2430 }
2431 
2438 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2439  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2440 }
2441 
2445 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2446  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2447 }
2448 
2452 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2453  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2454 }
2455 
2459 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2460  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2461 }
2464 //-----------------------------------------------------------------------------
2465 // Non-template routines from kmp_dispatch.cpp used in other sources
2466 
2467 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2468  return value == checker;
2469 }
2470 
2471 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2472  return value != checker;
2473 }
2474 
2475 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2476  return value < checker;
2477 }
2478 
2479 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2480  return value >= checker;
2481 }
2482 
2483 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2484  return value <= checker;
2485 }
2486 
2487 kmp_uint32
2488 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2489  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2490  void *obj // Higher-level synchronization object, or NULL.
2491  ) {
2492  // note: we may not belong to a team at this point
2493  volatile kmp_uint32 *spin = spinner;
2494  kmp_uint32 check = checker;
2495  kmp_uint32 spins;
2496  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2497  kmp_uint32 r;
2498 
2499  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2500  KMP_INIT_YIELD(spins);
2501  // main wait spin loop
2502  while (!f(r = TCR_4(*spin), check)) {
2503  KMP_FSYNC_SPIN_PREPARE(obj);
2504  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2505  split. It causes problems with infinite recursion because of exit lock */
2506  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2507  __kmp_abort_thread(); */
2508 
2509  /* if we have waited a bit, or are oversubscribed, yield */
2510  /* pause is in the following code */
2511  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2512  KMP_YIELD_SPIN(spins);
2513  }
2514  KMP_FSYNC_SPIN_ACQUIRED(obj);
2515  return r;
2516 }
2517 
2518 void __kmp_wait_yield_4_ptr(
2519  void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2520  void *obj // Higher-level synchronization object, or NULL.
2521  ) {
2522  // note: we may not belong to a team at this point
2523  void *spin = spinner;
2524  kmp_uint32 check = checker;
2525  kmp_uint32 spins;
2526  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2527 
2528  KMP_FSYNC_SPIN_INIT(obj, spin);
2529  KMP_INIT_YIELD(spins);
2530  // main wait spin loop
2531  while (!f(spin, check)) {
2532  KMP_FSYNC_SPIN_PREPARE(obj);
2533  /* if we have waited a bit, or are oversubscribed, yield */
2534  /* pause is in the following code */
2535  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2536  KMP_YIELD_SPIN(spins);
2537  }
2538  KMP_FSYNC_SPIN_ACQUIRED(obj);
2539 }
2540 
2541 } // extern "C"
2542 
2543 #ifdef KMP_GOMP_COMPAT
2544 
2545 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2546  enum sched_type schedule, kmp_int32 lb,
2547  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2548  int push_ws) {
2549  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2550  push_ws);
2551 }
2552 
2553 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2554  enum sched_type schedule, kmp_uint32 lb,
2555  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2556  int push_ws) {
2557  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2558  push_ws);
2559 }
2560 
2561 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2562  enum sched_type schedule, kmp_int64 lb,
2563  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2564  int push_ws) {
2565  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2566  push_ws);
2567 }
2568 
2569 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2570  enum sched_type schedule, kmp_uint64 lb,
2571  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2572  int push_ws) {
2573  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2574  push_ws);
2575 }
2576 
2577 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2578  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2579 }
2580 
2581 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2582  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2583 }
2584 
2585 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2586  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2587 }
2588 
2589 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2590  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2591 }
2592 
2593 #endif /* KMP_GOMP_COMPAT */
2594 
2595 /* ------------------------------------------------------------------------ */
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid)
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk)
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st)
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk)
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st)
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st)
#define KMP_COUNT_BLOCK(name)
Increments specified counter (name).
Definition: kmp_stats.h:890
sched_type
Definition: kmp.h:337
Definition: kmp.h:224
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid)
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid)
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid)
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk)
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk)
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st)
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk)