LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
16  * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
17  * it may change value between parallel regions. __kmp_max_nth
18  * is the largest value __kmp_nth may take; 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 #include "kmp_lock.h"
38 #include "kmp_dispatch.h"
39 #if KMP_USE_HIER_SCHED
40 #include "kmp_dispatch_hier.h"
41 #endif
42 
43 #if OMPT_SUPPORT
44 #include "ompt-specific.h"
45 #endif
46 
47 /* ------------------------------------------------------------------------ */
48 /* ------------------------------------------------------------------------ */
49 
50 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
51  kmp_info_t *th;
52 
53  KMP_DEBUG_ASSERT(gtid_ref);
54 
55  if (__kmp_env_consistency_check) {
56  th = __kmp_threads[*gtid_ref];
57  if (th->th.th_root->r.r_active &&
58  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
59 #if KMP_USE_DYNAMIC_LOCK
60  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
61 #else
62  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
63 #endif
64  }
65  }
66 }
67 
68 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
69  kmp_info_t *th;
70 
71  if (__kmp_env_consistency_check) {
72  th = __kmp_threads[*gtid_ref];
73  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
74  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
75  }
76  }
77 }
78 
79 // Initialize a dispatch_private_info_template<T> buffer for a particular
80 // type of schedule and chunk. The loop description is found in lb (lower bound),
81 // ub (upper bound), and st (stride). nproc is the number of threads relevant
82 // to the scheduling (often the number of threads in a team, but not always if
83 // hierarchical scheduling is used). tid is the id of the thread calling
84 // the function within the group of nproc threads. It will have a value
85 // between 0 and nproc - 1. This is often just the thread id within a team, but
86 // is not necessarily the case when using hierarchical scheduling.
87 // loc is the source file location of the corresponding loop
88 // gtid is the global thread id
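// Illustrative sketch (hypothetical values, not from any real caller): a loop
// compiled from "for (i = 0; i <= 999; i += 2)" with schedule(dynamic, 4) run
// by a 4-thread team would reach this routine roughly as
//   lb = 0, ub = 999, st = 2, chunk = 4, nproc = 4, tid in [0, 3],
// and the routine fills pr with the per-thread scheduling state for that loop.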
89 template <typename T>
90 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
91  dispatch_private_info_template<T> *pr,
92  enum sched_type schedule, T lb, T ub,
93  typename traits_t<T>::signed_t st,
94 #if USE_ITT_BUILD
95  kmp_uint64 *cur_chunk,
96 #endif
97  typename traits_t<T>::signed_t chunk,
98  T nproc, T tid) {
99  typedef typename traits_t<T>::unsigned_t UT;
100  typedef typename traits_t<T>::signed_t ST;
101  typedef typename traits_t<T>::floating_t DBL;
102 
103  int active;
104  T tc;
105  kmp_info_t *th;
106  kmp_team_t *team;
107 
108 #ifdef KMP_DEBUG
109  {
110  char *buff;
111  // create format specifiers before the debug output
112  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
113  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
114  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
115  traits_t<T>::spec, traits_t<T>::spec,
116  traits_t<ST>::spec, traits_t<ST>::spec,
117  traits_t<T>::spec, traits_t<T>::spec);
118  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
119  __kmp_str_free(&buff);
120  }
121 #endif
122  /* setup data */
123  th = __kmp_threads[gtid];
124  team = th->th.th_team;
125  active = !team->t.t_serialized;
126 
127 #if USE_ITT_BUILD
128  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
129  __kmp_forkjoin_frames_mode == 3 &&
130  KMP_MASTER_GTID(gtid) &&
131 #if OMP_40_ENABLED
132  th->th.th_teams_microtask == NULL &&
133 #endif
134  team->t.t_active_level == 1;
135 #endif
136 #if (KMP_STATIC_STEAL_ENABLED)
137  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
138  // AC: we now have only one implementation of stealing, so use it
139  schedule = kmp_sch_static_steal;
140  else
141 #endif
142  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
143 
144  /* Pick up the nomerge/ordered bits from the scheduling type */
145  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
146  pr->flags.nomerge = TRUE;
147  schedule =
148  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
149  } else {
150  pr->flags.nomerge = FALSE;
151  }
152  pr->type_size = traits_t<T>::type_size; // remember the size of variables
153  if (kmp_ord_lower & schedule) {
154  pr->flags.ordered = TRUE;
155  schedule =
156  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
157  } else {
158  pr->flags.ordered = FALSE;
159  }
160 
161  if (schedule == kmp_sch_static) {
162  schedule = __kmp_static;
163  } else {
164  if (schedule == kmp_sch_runtime) {
165  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
166  // not specified)
167  schedule = team->t.t_sched.r_sched_type;
168  // Detail the schedule if needed (global controls are differentiated
169  // appropriately)
170  if (schedule == kmp_sch_guided_chunked) {
171  schedule = __kmp_guided;
172  } else if (schedule == kmp_sch_static) {
173  schedule = __kmp_static;
174  }
175  // Use the chunk size specified by OMP_SCHEDULE (or default if not
176  // specified)
177  chunk = team->t.t_sched.chunk;
178 #if USE_ITT_BUILD
179  if (cur_chunk)
180  *cur_chunk = chunk;
181 #endif
182 #ifdef KMP_DEBUG
183  {
184  char *buff;
185  // create format specifiers before the debug output
186  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
187  "schedule:%%d chunk:%%%s\n",
188  traits_t<ST>::spec);
189  KD_TRACE(10, (buff, gtid, schedule, chunk));
190  __kmp_str_free(&buff);
191  }
192 #endif
193  } else {
194  if (schedule == kmp_sch_guided_chunked) {
195  schedule = __kmp_guided;
196  }
197  if (chunk <= 0) {
198  chunk = KMP_DEFAULT_CHUNK;
199  }
200  }
201 
202  if (schedule == kmp_sch_auto) {
203  // mapping and differentiation are done in __kmp_do_serial_initialize()
204  schedule = __kmp_auto;
205 #ifdef KMP_DEBUG
206  {
207  char *buff;
208  // create format specifiers before the debug output
209  buff = __kmp_str_format(
210  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
211  "schedule:%%d chunk:%%%s\n",
212  traits_t<ST>::spec);
213  KD_TRACE(10, (buff, gtid, schedule, chunk));
214  __kmp_str_free(&buff);
215  }
216 #endif
217  }
218 
219  /* guided analytical not safe for too many threads */
220  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
221  schedule = kmp_sch_guided_iterative_chunked;
222  KMP_WARNING(DispatchManyThreads);
223  }
224 #if OMP_45_ENABLED
225  if (schedule == kmp_sch_runtime_simd) {
226  // compiler provides simd_width in the chunk parameter
227  schedule = team->t.t_sched.r_sched_type;
228  // Detail the schedule if needed (global controls are differentiated
229  // appropriately)
230  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
231  schedule == __kmp_static) {
232  schedule = kmp_sch_static_balanced_chunked;
233  } else {
234  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
235  schedule = kmp_sch_guided_simd;
236  }
237  chunk = team->t.t_sched.chunk * chunk;
238  }
239 #if USE_ITT_BUILD
240  if (cur_chunk)
241  *cur_chunk = chunk;
242 #endif
243 #ifdef KMP_DEBUG
244  {
245  char *buff;
246  // create format specifiers before the debug output
247  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
248  " chunk:%%%s\n",
249  traits_t<ST>::spec);
250  KD_TRACE(10, (buff, gtid, schedule, chunk));
251  __kmp_str_free(&buff);
252  }
253 #endif
254  }
255 #endif // OMP_45_ENABLED
256  pr->u.p.parm1 = chunk;
257  }
258  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
259  "unknown scheduling type");
260 
261  pr->u.p.count = 0;
262 
263  if (__kmp_env_consistency_check) {
264  if (st == 0) {
265  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
266  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
267  }
268  }
269  // compute trip count
270  if (st == 1) { // most common case
271  if (ub >= lb) {
272  tc = ub - lb + 1;
273  } else { // ub < lb
274  tc = 0; // zero-trip
275  }
276  } else if (st < 0) {
277  if (lb >= ub) {
278  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
279  // where the division needs to be unsigned regardless of the result type
280  tc = (UT)(lb - ub) / (-st) + 1;
281  } else { // lb < ub
282  tc = 0; // zero-trip
283  }
284  } else { // st > 0
285  if (ub >= lb) {
286  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
287  // where the division needs to be unsigned regardless of the result type
288  tc = (UT)(ub - lb) / st + 1;
289  } else { // ub < lb
290  tc = 0; // zero-trip
291  }
292  }
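// Worked example (illustrative numbers only): lb = 0, ub = 10, st = 3 takes
// the st > 0 branch, giving tc = (10 - 0) / 3 + 1 = 4 iterations (0, 3, 6, 9);
// the mirrored loop lb = 10, ub = 0, st = -3 likewise gives tc = 4.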
293 
294  pr->u.p.lb = lb;
295  pr->u.p.ub = ub;
296  pr->u.p.st = st;
297  pr->u.p.tc = tc;
298 
299 #if KMP_OS_WINDOWS
300  pr->u.p.last_upper = ub + st;
301 #endif /* KMP_OS_WINDOWS */
302 
303  /* NOTE: only the active parallel region(s) have active ordered sections */
304 
305  if (active) {
306  if (pr->flags.ordered) {
307  pr->ordered_bumped = 0;
308  pr->u.p.ordered_lower = 1;
309  pr->u.p.ordered_upper = 0;
310  }
311  }
312 
313  switch (schedule) {
314 #if (KMP_STATIC_STEAL_ENABLED)
315  case kmp_sch_static_steal: {
316  T ntc, init;
317 
318  KD_TRACE(100,
319  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
320  gtid));
321 
322  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
323  if (nproc > 1 && ntc >= nproc) {
324  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
325  T id = tid;
326  T small_chunk, extras;
327 
328  small_chunk = ntc / nproc;
329  extras = ntc % nproc;
330 
331  init = id * small_chunk + (id < extras ? id : extras);
332  pr->u.p.count = init;
333  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
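// Example of the split above (illustrative): tc = 25, chunk = 3, nproc = 4
// gives ntc = 9 chunks, small_chunk = 2, extras = 1, so thread 0 owns chunk
// indices [0,3) and threads 1..3 own [3,5), [5,7), [7,9) (count..ub, ub
// exclusive); the extra chunk goes to the lowest-numbered thread.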
334 
335  pr->u.p.parm2 = lb;
336  // pr->pfields.parm3 = 0; // it's not used in static_steal
337  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
338  pr->u.p.st = st;
339  if (traits_t<T>::type_size > 4) {
340  // AC: TODO: check if 16-byte CAS available and use it to
341  // improve performance (probably wait for explicit request
342  // before spending time on this).
343  // For now use dynamically allocated per-thread lock,
344  // free memory in __kmp_dispatch_next when status==0.
345  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
346  th->th.th_dispatch->th_steal_lock =
347  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
348  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
349  }
350  break;
351  } else {
352  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
353  "kmp_sch_static_balanced\n",
354  gtid));
355  schedule = kmp_sch_static_balanced;
356  /* too few iterations: fall-through to kmp_sch_static_balanced */
357  } // if
358  /* FALL-THROUGH to static balanced */
359  } // case
360 #endif
361  case kmp_sch_static_balanced: {
362  T init, limit;
363 
364  KD_TRACE(
365  100,
366  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
367  gtid));
368 
369  if (nproc > 1) {
370  T id = tid;
371 
372  if (tc < nproc) {
373  if (id < tc) {
374  init = id;
375  limit = id;
376  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
377  } else {
378  pr->u.p.count = 1; /* means no more chunks to execute */
379  pr->u.p.parm1 = FALSE;
380  break;
381  }
382  } else {
383  T small_chunk = tc / nproc;
384  T extras = tc % nproc;
385  init = id * small_chunk + (id < extras ? id : extras);
386  limit = init + small_chunk - (id < extras ? 0 : 1);
387  pr->u.p.parm1 = (id == nproc - 1);
388  }
389  } else {
390  if (tc > 0) {
391  init = 0;
392  limit = tc - 1;
393  pr->u.p.parm1 = TRUE;
394  } else {
395  // zero trip count
396  pr->u.p.count = 1; /* means no more chunks to execute */
397  pr->u.p.parm1 = FALSE;
398  break;
399  }
400  }
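// Example of the distribution above (illustrative): tc = 10, nproc = 4 gives
// small_chunk = 2, extras = 2, so threads 0..3 get the inclusive iteration
// ranges [0,2], [3,5], [6,7], [8,9]; the extra iterations go to the
// lowest-numbered threads.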
401 #if USE_ITT_BUILD
402  // Calculate chunk for metadata report
403  if (itt_need_metadata_reporting)
404  if (cur_chunk)
405  *cur_chunk = limit - init + 1;
406 #endif
407  if (st == 1) {
408  pr->u.p.lb = lb + init;
409  pr->u.p.ub = lb + limit;
410  } else {
411  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
412  T ub_tmp = lb + limit * st;
413  pr->u.p.lb = lb + init * st;
414  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
415  // it exactly
416  if (st > 0) {
417  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
418  } else {
419  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
420  }
421  }
422  if (pr->flags.ordered) {
423  pr->u.p.ordered_lower = init;
424  pr->u.p.ordered_upper = limit;
425  }
426  break;
427  } // case
428 #if OMP_45_ENABLED
429  case kmp_sch_static_balanced_chunked: {
430  // similar to balanced, but chunk adjusted to multiple of simd width
431  T nth = nproc;
432  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
433  " -> falling-through to static_greedy\n",
434  gtid));
435  schedule = kmp_sch_static_greedy;
436  if (nth > 1)
437  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
438  else
439  pr->u.p.parm1 = tc;
440  break;
441  } // case
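// Example of the balanced_chunked rounding above (illustrative, assuming
// chunk is the simd width and a power of two): tc = 1000, nth = 8, chunk = 8
// gives (1000 + 7) / 8 = 125 iterations per thread, rounded up to the next
// multiple of the simd width, so parm1 = 128.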
442  case kmp_sch_guided_simd:
443 #endif // OMP_45_ENABLED
444  case kmp_sch_guided_iterative_chunked: {
445  KD_TRACE(
446  100,
447  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
448  " case\n",
449  gtid));
450 
451  if (nproc > 1) {
452  if ((2L * chunk + 1) * nproc >= tc) {
453  /* chunk size too large, switch to dynamic */
454  schedule = kmp_sch_dynamic_chunked;
455  } else {
456  // when remaining iters become less than parm2 - switch to dynamic
457  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
458  *(double *)&pr->u.p.parm3 =
459  guided_flt_param / nproc; // may occupy parm3 and parm4
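// Example (illustrative, assuming the default guided parameters, i.e.
// guided_int_param = 2 and guided_flt_param = 0.5, matching the "K=2 by
// default" note in the dispatch code below): nproc = 4, chunk = 7 gives
// parm2 = 2 * 4 * 8 = 64, so threads fall back to plain dynamic chunks once
// fewer than 64 iterations remain, and parm3 = 0.5 / 4 = 0.125, the fraction
// of the remaining iterations claimed per dispatch request.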
460  }
461  } else {
462  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
463  "kmp_sch_static_greedy\n",
464  gtid));
465  schedule = kmp_sch_static_greedy;
466  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
467  KD_TRACE(
468  100,
469  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
470  gtid));
471  pr->u.p.parm1 = tc;
472  } // if
473  } // case
474  break;
475  case kmp_sch_guided_analytical_chunked: {
476  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
477  "kmp_sch_guided_analytical_chunked case\n",
478  gtid));
479 
480  if (nproc > 1) {
481  if ((2L * chunk + 1) * nproc >= tc) {
482  /* chunk size too large, switch to dynamic */
483  schedule = kmp_sch_dynamic_chunked;
484  } else {
485  /* commonly used term: (2 nproc - 1)/(2 nproc) */
486  DBL x;
487 
488 #if KMP_OS_WINDOWS && KMP_ARCH_X86
489  /* Linux* OS already has 64-bit computation by default for long double,
490  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
491  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
492  instead of the default 53-bit. Even though long double doesn't work
493  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
494  expected to impact the correctness of the algorithm, but this has not
495  been mathematically proven. */
496  // save original FPCW and set precision to 64-bit, as
497  // Windows* OS on IA-32 architecture defaults to 53-bit
498  unsigned int oldFpcw = _control87(0, 0);
499  _control87(_PC_64, _MCW_PC); // 0,0x30000
500 #endif
501  /* value used for comparison in solver for cross-over point */
502  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
503 
504  /* crossover point--chunk indexes equal to or greater than
505  this point switch to dynamic-style scheduling */
506  UT cross;
507 
508  /* commonly used term: (2 nproc - 1)/(2 nproc) */
509  x = (long double)1.0 - (long double)0.5 / nproc;
510 
511 #ifdef KMP_DEBUG
512  { // test natural alignment
513  struct _test_a {
514  char a;
515  union {
516  char b;
517  DBL d;
518  };
519  } t;
520  ptrdiff_t natural_alignment =
521  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
522  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
523  // long)natural_alignment );
524  KMP_DEBUG_ASSERT(
525  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
526  }
527 #endif // KMP_DEBUG
528 
529  /* save the term in thread private dispatch structure */
530  *(DBL *)&pr->u.p.parm3 = x;
531 
532  /* solve for the crossover point to the nearest integer i for which C_i
533  <= chunk */
534  {
535  UT left, right, mid;
536  long double p;
537 
538  /* estimate initial upper and lower bound */
539 
540  /* doesn't matter what value right is as long as it is positive, but
541  it affects performance of the solver */
542  right = 229;
543  p = __kmp_pow<UT>(x, right);
544  if (p > target) {
545  do {
546  p *= p;
547  right <<= 1;
548  } while (p > target && right < (1 << 27));
549  /* lower bound is previous (failed) estimate of upper bound */
550  left = right >> 1;
551  } else {
552  left = 0;
553  }
554 
555  /* bisection root-finding method */
556  while (left + 1 < right) {
557  mid = (left + right) / 2;
558  if (__kmp_pow<UT>(x, mid) > target) {
559  left = mid;
560  } else {
561  right = mid;
562  }
563  } // while
564  cross = right;
565  }
566  /* assert sanity of computed crossover point */
567  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
568  __kmp_pow<UT>(x, cross) <= target);
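// Illustrative numbers for the solver above: nproc = 4, chunk = 7, tc = 1000
// give x = 1 - 0.5 / 4 = 0.875 and target = (2 * 7 + 1) * 4 / 1000 = 0.06;
// the bisection settles on cross = 22, since 0.875^21 > 0.06 >= 0.875^22.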
569 
570  /* save the crossover point in thread private dispatch structure */
571  pr->u.p.parm2 = cross;
572 
573 // C75803
574 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
575 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
576 #else
577 #define GUIDED_ANALYTICAL_WORKAROUND (x)
578 #endif
579  /* dynamic-style scheduling offset */
580  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
581  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
582  cross * chunk;
583 #if KMP_OS_WINDOWS && KMP_ARCH_X86
584  // restore FPCW
585  _control87(oldFpcw, _MCW_PC);
586 #endif
587  } // if
588  } else {
589  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
590  "kmp_sch_static_greedy\n",
591  gtid));
592  schedule = kmp_sch_static_greedy;
593  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
594  pr->u.p.parm1 = tc;
595  } // if
596  } // case
597  break;
598  case kmp_sch_static_greedy:
599  KD_TRACE(
600  100,
601  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
602  gtid));
603  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
604  break;
605  case kmp_sch_static_chunked:
606  case kmp_sch_dynamic_chunked:
607  if (pr->u.p.parm1 <= 0) {
608  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
609  }
610  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
611  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
612  gtid));
613  break;
614  case kmp_sch_trapezoidal: {
615  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
616 
617  T parm1, parm2, parm3, parm4;
618  KD_TRACE(100,
619  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
620  gtid));
621 
622  parm1 = chunk;
623 
624  /* F : size of the first cycle */
625  parm2 = (tc / (2 * nproc));
626 
627  if (parm2 < 1) {
628  parm2 = 1;
629  }
630 
631  /* L : size of the last cycle. Make sure the last cycle is not larger
632  than the first cycle. */
633  if (parm1 < 1) {
634  parm1 = 1;
635  } else if (parm1 > parm2) {
636  parm1 = parm2;
637  }
638 
639  /* N : number of cycles */
640  parm3 = (parm2 + parm1);
641  parm3 = (2 * tc + parm3 - 1) / parm3;
642 
643  if (parm3 < 2) {
644  parm3 = 2;
645  }
646 
647  /* sigma : decreasing incr of the trapezoid */
648  parm4 = (parm3 - 1);
649  parm4 = (parm2 - parm1) / parm4;
650 
651  // pointless check, because parm4 >= 0 always
652  // if ( parm4 < 0 ) {
653  // parm4 = 0;
654  //}
655 
656  pr->u.p.parm1 = parm1;
657  pr->u.p.parm2 = parm2;
658  pr->u.p.parm3 = parm3;
659  pr->u.p.parm4 = parm4;
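// Worked example (illustrative): tc = 1000, nproc = 4, chunk = 10 gives
// parm2 = 1000 / 8 = 125 (first chunk), parm1 = 10 (minimum/last chunk),
// parm3 = (2000 + 134) / 135 = 15 chunks in total, and parm4 = 115 / 14 = 8,
// so successive chunk sizes shrink as 125, 117, 109, ... toward parm1.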
660  } // case
661  break;
662 
663  default: {
664  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
665  KMP_HNT(GetNewerLibrary), // Hint
666  __kmp_msg_null // Variadic argument list terminator
667  );
668  } break;
669  } // switch
670  pr->schedule = schedule;
671 }
672 
673 #if KMP_USE_HIER_SCHED
674 template <typename T>
675 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
676  typename traits_t<T>::signed_t st);
677 template <>
678 inline void
679 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
680  kmp_int32 ub, kmp_int32 st) {
681  __kmp_dispatch_init_hierarchy<kmp_int32>(
682  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
683  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
684 }
685 template <>
686 inline void
687 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
688  kmp_uint32 ub, kmp_int32 st) {
689  __kmp_dispatch_init_hierarchy<kmp_uint32>(
690  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
691  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
692 }
693 template <>
694 inline void
695 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
696  kmp_int64 ub, kmp_int64 st) {
697  __kmp_dispatch_init_hierarchy<kmp_int64>(
698  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
699  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
700 }
701 template <>
702 inline void
703 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
704  kmp_uint64 ub, kmp_int64 st) {
705  __kmp_dispatch_init_hierarchy<kmp_uint64>(
706  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
707  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
708 }
709 
710 // free all the hierarchy scheduling memory associated with the team
711 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
712  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
713  for (int i = 0; i < num_disp_buff; ++i) {
714  // type does not matter here so use kmp_int32
715  auto sh =
716  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
717  &team->t.t_disp_buffer[i]);
718  if (sh->hier) {
719  sh->hier->deallocate();
720  __kmp_free(sh->hier);
721  }
722  }
723 }
724 #endif
725 
726 // UT - unsigned flavor of T, ST - signed flavor of T,
727 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
728 template <typename T>
729 static void
730 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
731  T ub, typename traits_t<T>::signed_t st,
732  typename traits_t<T>::signed_t chunk, int push_ws) {
733  typedef typename traits_t<T>::unsigned_t UT;
734  typedef typename traits_t<T>::signed_t ST;
735  typedef typename traits_t<T>::floating_t DBL;
736 
737  int active;
738  kmp_info_t *th;
739  kmp_team_t *team;
740  kmp_uint32 my_buffer_index;
741  dispatch_private_info_template<T> *pr;
742  dispatch_shared_info_template<T> volatile *sh;
743 
744  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
745  sizeof(dispatch_private_info));
746  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
747  sizeof(dispatch_shared_info));
748 
749  if (!TCR_4(__kmp_init_parallel))
750  __kmp_parallel_initialize();
751 
752 #if INCLUDE_SSC_MARKS
753  SSC_MARK_DISPATCH_INIT();
754 #endif
755 #ifdef KMP_DEBUG
756  {
757  char *buff;
758  // create format specifiers before the debug output
759  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
760  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
761  traits_t<ST>::spec, traits_t<T>::spec,
762  traits_t<T>::spec, traits_t<ST>::spec);
763  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
764  __kmp_str_free(&buff);
765  }
766 #endif
767  /* setup data */
768  th = __kmp_threads[gtid];
769  team = th->th.th_team;
770  active = !team->t.t_serialized;
771  th->th.th_ident = loc;
772 
773  // Any half-decent optimizer will remove this test when the blocks are
774  // empty, since the macros expand to nothing when statistics are
775  // disabled.
776  if (schedule == __kmp_static) {
777  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
778  } else {
779  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
780  }
781 
782 #if KMP_USE_HIER_SCHED
783  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE environment variable
784  // Hierarchical scheduling does not work with ordered, so if ordered is
785  // detected, then revert back to threaded scheduling.
786  bool ordered;
787  enum sched_type my_sched = schedule;
788  my_buffer_index = th->th.th_dispatch->th_disp_index;
789  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
790  &th->th.th_dispatch
791  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
792  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
793  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
794  my_sched =
795  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
796  ordered = (kmp_ord_lower & my_sched);
797  if (pr->flags.use_hier) {
798  if (ordered) {
799  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
800  "Disabling hierarchical scheduling.\n",
801  gtid));
802  pr->flags.use_hier = FALSE;
803  }
804  }
805  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
806  // Don't use hierarchical for ordered parallel loops and don't
807  // use the runtime hierarchy if one was specified in the program
808  if (!ordered && !pr->flags.use_hier)
809  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
810  }
811 #endif // KMP_USE_HIER_SCHED
812 
813 #if USE_ITT_BUILD
814  kmp_uint64 cur_chunk = chunk;
815  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
816  __kmp_forkjoin_frames_mode == 3 &&
817  KMP_MASTER_GTID(gtid) &&
818 #if OMP_40_ENABLED
819  th->th.th_teams_microtask == NULL &&
820 #endif
821  team->t.t_active_level == 1;
822 #endif
823  if (!active) {
824  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
825  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
826  } else {
827  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
828  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
829 
830  my_buffer_index = th->th.th_dispatch->th_disp_index++;
831 
832  /* What happens when the number of threads changes? Do we need to resize the buffer? */
833  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
834  &th->th.th_dispatch
835  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
836  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
837  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
838  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
839  my_buffer_index));
840  }
841 
842  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
843 #if USE_ITT_BUILD
844  &cur_chunk,
845 #endif
846  chunk, (T)th->th.th_team_nproc,
847  (T)th->th.th_info.ds.ds_tid);
848  if (active) {
849  if (pr->flags.ordered == 0) {
850  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
851  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
852  } else {
853  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
854  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
855  }
856  }
857 
858  if (active) {
859  /* This buffer is free to use once sh->buffer_index has caught up to
860  * my_buffer_index */
861 
862  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
863  "sh->buffer_index:%d\n",
864  gtid, my_buffer_index, sh->buffer_index));
865  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
866  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
867  // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and
868  // my_buffer_index are *always* 32-bit integers.
869  KMP_MB(); /* is this necessary? */
870  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
871  "sh->buffer_index:%d\n",
872  gtid, my_buffer_index, sh->buffer_index));
873 
874  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
875  th->th.th_dispatch->th_dispatch_sh_current =
876  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
877 #if USE_ITT_BUILD
878  if (pr->flags.ordered) {
879  __kmp_itt_ordered_init(gtid);
880  }
881  // Report loop metadata
882  if (itt_need_metadata_reporting) {
883  // Only report metadata by master of active team at level 1
884  kmp_uint64 schedtype = 0;
885  switch (schedule) {
886  case kmp_sch_static_chunked:
887  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
888  break;
889  case kmp_sch_static_greedy:
890  cur_chunk = pr->u.p.parm1;
891  break;
892  case kmp_sch_dynamic_chunked:
893  schedtype = 1;
894  break;
895  case kmp_sch_guided_iterative_chunked:
896  case kmp_sch_guided_analytical_chunked:
897 #if OMP_45_ENABLED
898  case kmp_sch_guided_simd:
899 #endif
900  schedtype = 2;
901  break;
902  default:
903  // Should we put this case under "static"?
904  // case kmp_sch_static_steal:
905  schedtype = 3;
906  break;
907  }
908  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
909  }
910 #if KMP_USE_HIER_SCHED
911  if (pr->flags.use_hier) {
912  pr->u.p.count = 0;
913  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
914  }
915 #endif // KMP_USE_HIER_SCHED
916 #endif /* USE_ITT_BUILD */
917  }
918 
919 #ifdef KMP_DEBUG
920  {
921  char *buff;
922  // create format specifiers before the debug output
923  buff = __kmp_str_format(
924  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
925  "lb:%%%s ub:%%%s"
926  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
927  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
928  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
929  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
930  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
931  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
932  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
933  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
934  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
935  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
936  __kmp_str_free(&buff);
937  }
938 #endif
939 #if (KMP_STATIC_STEAL_ENABLED)
940  // It cannot be guaranteed that after execution of a loop with some other
941  // schedule kind all the parm3 variables will contain the same value. Even if
942  // all parm3 were the same, there would still be a bad case, such as reusing
943  // 0 and 1 rather than a program-lifetime increment. So a dedicated variable
944  // is required; the 'static_steal_counter' serves that purpose.
945  if (schedule == kmp_sch_static_steal) {
946  // Other threads will inspect this variable when searching for a victim.
947  // It serves as a flag showing that other threads may steal from this
948  // thread from now on.
949  volatile T *p = &pr->u.p.static_steal_counter;
950  *p = *p + 1;
951  }
952 #endif // ( KMP_STATIC_STEAL_ENABLED )
953 
954 #if OMPT_SUPPORT && OMPT_OPTIONAL
955  if (ompt_enabled.ompt_callback_work) {
956  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
957  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
958  ompt_callbacks.ompt_callback(ompt_callback_work)(
959  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
960  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
961  }
962 #endif
963  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
964 }
965 
966 /* For ordered loops, either __kmp_dispatch_finish() should be called after
967  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
968  * every chunk of iterations. If the ordered section(s) were not executed
969  * for this iteration (or every iteration in this chunk), we need to set the
970  * ordered iteration counters so that the next thread can proceed. */
971 template <typename UT>
972 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
973  typedef typename traits_t<UT>::signed_t ST;
974  kmp_info_t *th = __kmp_threads[gtid];
975 
976  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
977  if (!th->th.th_team->t.t_serialized) {
978 
979  dispatch_private_info_template<UT> *pr =
980  reinterpret_cast<dispatch_private_info_template<UT> *>(
981  th->th.th_dispatch->th_dispatch_pr_current);
982  dispatch_shared_info_template<UT> volatile *sh =
983  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
984  th->th.th_dispatch->th_dispatch_sh_current);
985  KMP_DEBUG_ASSERT(pr);
986  KMP_DEBUG_ASSERT(sh);
987  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
988  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
989 
990  if (pr->ordered_bumped) {
991  KD_TRACE(
992  1000,
993  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
994  gtid));
995  pr->ordered_bumped = 0;
996  } else {
997  UT lower = pr->u.p.ordered_lower;
998 
999 #ifdef KMP_DEBUG
1000  {
1001  char *buff;
1002  // create format specifiers before the debug output
1003  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1004  "ordered_iteration:%%%s lower:%%%s\n",
1005  traits_t<UT>::spec, traits_t<UT>::spec);
1006  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1007  __kmp_str_free(&buff);
1008  }
1009 #endif
1010 
1011  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1012  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1013  KMP_MB(); /* is this necessary? */
1014 #ifdef KMP_DEBUG
1015  {
1016  char *buff;
1017  // create format specifiers before the debug output
1018  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1019  "ordered_iteration:%%%s lower:%%%s\n",
1020  traits_t<UT>::spec, traits_t<UT>::spec);
1021  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1022  __kmp_str_free(&buff);
1023  }
1024 #endif
1025 
1026  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1027  } // if
1028  } // if
1029  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1030 }
1031 
1032 #ifdef KMP_GOMP_COMPAT
1033 
1034 template <typename UT>
1035 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1036  typedef typename traits_t<UT>::signed_t ST;
1037  kmp_info_t *th = __kmp_threads[gtid];
1038 
1039  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1040  if (!th->th.th_team->t.t_serialized) {
1041  // int cid;
1042  dispatch_private_info_template<UT> *pr =
1043  reinterpret_cast<dispatch_private_info_template<UT> *>(
1044  th->th.th_dispatch->th_dispatch_pr_current);
1045  dispatch_shared_info_template<UT> volatile *sh =
1046  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1047  th->th.th_dispatch->th_dispatch_sh_current);
1048  KMP_DEBUG_ASSERT(pr);
1049  KMP_DEBUG_ASSERT(sh);
1050  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1051  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1052 
1053  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1054  UT lower = pr->u.p.ordered_lower;
1055  UT upper = pr->u.p.ordered_upper;
1056  UT inc = upper - lower + 1;
1057 
1058  if (pr->ordered_bumped == inc) {
1059  KD_TRACE(
1060  1000,
1061  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1062  gtid));
1063  pr->ordered_bumped = 0;
1064  } else {
1065  inc -= pr->ordered_bumped;
1066 
1067 #ifdef KMP_DEBUG
1068  {
1069  char *buff;
1070  // create format specifiers before the debug output
1071  buff = __kmp_str_format(
1072  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1073  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1074  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1075  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1076  __kmp_str_free(&buff);
1077  }
1078 #endif
1079 
1080  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1081  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1082 
1083  KMP_MB(); /* is this necessary? */
1084  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1085  "ordered_bumped to zero\n",
1086  gtid));
1087  pr->ordered_bumped = 0;
1089 #ifdef KMP_DEBUG
1090  {
1091  char *buff;
1092  // create format specifiers before the debug output
1093  buff = __kmp_str_format(
1094  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1095  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1096  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1097  traits_t<UT>::spec);
1098  KD_TRACE(1000,
1099  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1100  __kmp_str_free(&buff);
1101  }
1102 #endif
1103 
1104  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1105  }
1106  // }
1107  }
1108  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1109 }
1110 
1111 #endif /* KMP_GOMP_COMPAT */
1112 
1113 template <typename T>
1114 int __kmp_dispatch_next_algorithm(int gtid,
1115  dispatch_private_info_template<T> *pr,
1116  dispatch_shared_info_template<T> volatile *sh,
1117  kmp_int32 *p_last, T *p_lb, T *p_ub,
1118  typename traits_t<T>::signed_t *p_st, T nproc,
1119  T tid) {
1120  typedef typename traits_t<T>::unsigned_t UT;
1121  typedef typename traits_t<T>::signed_t ST;
1122  typedef typename traits_t<T>::floating_t DBL;
1123  int status = 0;
1124  kmp_int32 last = 0;
1125  T start;
1126  ST incr;
1127  UT limit, trip, init;
1128  kmp_info_t *th = __kmp_threads[gtid];
1129  kmp_team_t *team = th->th.th_team;
1130 
1131  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1132  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1133  KMP_DEBUG_ASSERT(pr);
1134  KMP_DEBUG_ASSERT(sh);
1135  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1136 #ifdef KMP_DEBUG
1137  {
1138  char *buff;
1139  // create format specifiers before the debug output
1140  buff =
1141  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1142  "sh:%%p nproc:%%%s tid:%%%s\n",
1143  traits_t<T>::spec, traits_t<T>::spec);
1144  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1145  __kmp_str_free(&buff);
1146  }
1147 #endif
1148 
1149  // zero trip count
1150  if (pr->u.p.tc == 0) {
1151  KD_TRACE(10,
1152  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1153  "zero status:%d\n",
1154  gtid, status));
1155  return 0;
1156  }
1157 
1158  switch (pr->schedule) {
1159 #if (KMP_STATIC_STEAL_ENABLED)
1160  case kmp_sch_static_steal: {
1161  T chunk = pr->u.p.parm1;
1162 
1163  KD_TRACE(100,
1164  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1165  gtid));
1166 
1167  trip = pr->u.p.tc - 1;
1168 
1169  if (traits_t<T>::type_size > 4) {
1170  // use lock for 8-byte and CAS for 4-byte induction
1171  // variable. TODO (optional): check and use 16-byte CAS
1172  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1173  KMP_DEBUG_ASSERT(lck != NULL);
1174  if (pr->u.p.count < (UT)pr->u.p.ub) {
1175  __kmp_acquire_lock(lck, gtid);
1176  // try to get own chunk of iterations
1177  init = (pr->u.p.count)++;
1178  status = (init < (UT)pr->u.p.ub);
1179  __kmp_release_lock(lck, gtid);
1180  } else {
1181  status = 0; // no own chunks
1182  }
1183  if (!status) { // try to steal
1184  kmp_info_t **other_threads = team->t.t_threads;
1185  int while_limit = nproc; // nproc attempts to find a victim
1186  int while_index = 0;
1187  // TODO: the algorithm for searching for a victim
1188  // should be cleaned up and measured
1189  while ((!status) && (while_limit != ++while_index)) {
1190  T remaining;
1191  T victimIdx = pr->u.p.parm4;
1192  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1193  dispatch_private_info_template<T> *victim =
1194  reinterpret_cast<dispatch_private_info_template<T> *>(
1195  other_threads[victimIdx]
1196  ->th.th_dispatch->th_dispatch_pr_current);
1197  while ((victim == NULL || victim == pr ||
1198  (*(volatile T *)&victim->u.p.static_steal_counter !=
1199  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1200  oldVictimIdx != victimIdx) {
1201  victimIdx = (victimIdx + 1) % nproc;
1202  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1203  other_threads[victimIdx]
1204  ->th.th_dispatch->th_dispatch_pr_current);
1205  }
1206  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1207  *(volatile T *)&pr->u.p.static_steal_counter)) {
1208  continue; // try once more (nproc attempts in total)
1209  // no victim is ready yet to participate in stealing
1210  // because all victims are still in kmp_init_dispatch
1211  }
1212  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1213  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1214  continue; // not enough chunks to steal, goto next victim
1215  }
1216 
1217  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1218  KMP_ASSERT(lck != NULL);
1219  __kmp_acquire_lock(lck, gtid);
1220  limit = victim->u.p.ub; // keep initial ub
1221  if (victim->u.p.count >= limit ||
1222  (remaining = limit - victim->u.p.count) < 2) {
1223  __kmp_release_lock(lck, gtid);
1224  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1225  continue; // not enough chunks to steal
1226  }
1227  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1228  // by 1
1229  if (remaining > 3) {
1230  // steal 1/4 of remaining
1231  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1232  init = (victim->u.p.ub -= (remaining >> 2));
1233  } else {
1234  // steal 1 chunk of 2 or 3 remaining
1235  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1236  init = (victim->u.p.ub -= 1);
1237  }
1238  __kmp_release_lock(lck, gtid);
1239 
1240  KMP_DEBUG_ASSERT(init + 1 <= limit);
1241  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1242  status = 1;
1243  while_index = 0;
1244  // now update own count and ub with the stolen range, excluding the init chunk
1245  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1246  pr->u.p.count = init + 1;
1247  pr->u.p.ub = limit;
1248  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1249  } // while (search for victim)
1250  } // if (try to find victim and steal)
1251  } else {
1252  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1253  typedef union {
1254  struct {
1255  UT count;
1256  T ub;
1257  } p;
1258  kmp_int64 b;
1259  } union_i4;
1260  // All operations on 'count' or 'ub' must be combined atomically
1261  // together.
1262  {
1263  union_i4 vold, vnew;
1264  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1265  vnew = vold;
1266  vnew.p.count++;
1267  while (!KMP_COMPARE_AND_STORE_ACQ64(
1268  (volatile kmp_int64 *)&pr->u.p.count,
1269  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1270  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1271  KMP_CPU_PAUSE();
1272  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1273  vnew = vold;
1274  vnew.p.count++;
1275  }
1276  vnew = vold;
1277  init = vnew.p.count;
1278  status = (init < (UT)vnew.p.ub);
1279  }
1280 
1281  if (!status) {
1282  kmp_info_t **other_threads = team->t.t_threads;
1283  int while_limit = nproc; // nproc attempts to find a victim
1284  int while_index = 0;
1285 
1286  // TODO: the algorithm for searching for a victim
1287  // should be cleaned up and measured
1288  while ((!status) && (while_limit != ++while_index)) {
1289  union_i4 vold, vnew;
1290  kmp_int32 remaining;
1291  T victimIdx = pr->u.p.parm4;
1292  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1293  dispatch_private_info_template<T> *victim =
1294  reinterpret_cast<dispatch_private_info_template<T> *>(
1295  other_threads[victimIdx]
1296  ->th.th_dispatch->th_dispatch_pr_current);
1297  while ((victim == NULL || victim == pr ||
1298  (*(volatile T *)&victim->u.p.static_steal_counter !=
1299  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1300  oldVictimIdx != victimIdx) {
1301  victimIdx = (victimIdx + 1) % nproc;
1302  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1303  other_threads[victimIdx]
1304  ->th.th_dispatch->th_dispatch_pr_current);
1305  }
1306  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1307  *(volatile T *)&pr->u.p.static_steal_counter)) {
1308  continue; // try once more (nproc attempts in total)
1309  // no victim is ready yet to participate in stealing
1310  // because all victims are still in kmp_init_dispatch
1311  }
1312  pr->u.p.parm4 = victimIdx; // new victim found
1313  while (1) { // CAS loop if victim has enough chunks to steal
1314  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1315  vnew = vold;
1316 
1317  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1318  if (vnew.p.count >= (UT)vnew.p.ub ||
1319  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1320  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1321  break; // not enough chunks to steal, goto next victim
1322  }
1323  if (remaining > 3) {
1324  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1325  } else {
1326  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1327  }
1328  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1329  // TODO: Should this be acquire or release?
1330  if (KMP_COMPARE_AND_STORE_ACQ64(
1331  (volatile kmp_int64 *)&victim->u.p.count,
1332  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1333  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1334  // stealing succeeded
1335  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1336  vold.p.ub - vnew.p.ub);
1337  status = 1;
1338  while_index = 0;
1339  // now update own count and ub
1340  init = vnew.p.ub;
1341  vold.p.count = init + 1;
1342 #if KMP_ARCH_X86
1343  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1344 #else
1345  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1346 #endif
1347  break;
1348  } // if (check CAS result)
1349  KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1350  } // while (try to steal from particular victim)
1351  } // while (search for victim)
1352  } // if (try to find victim and steal)
1353  } // if (4-byte induction variable)
1354  if (!status) {
1355  *p_lb = 0;
1356  *p_ub = 0;
1357  if (p_st != NULL)
1358  *p_st = 0;
1359  } else {
1360  start = pr->u.p.parm2;
1361  init *= chunk;
1362  limit = chunk + init - 1;
1363  incr = pr->u.p.st;
1364  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1365 
1366  KMP_DEBUG_ASSERT(init <= trip);
1367  if ((last = (limit >= trip)) != 0)
1368  limit = trip;
1369  if (p_st != NULL)
1370  *p_st = incr;
1371 
1372  if (incr == 1) {
1373  *p_lb = start + init;
1374  *p_ub = start + limit;
1375  } else {
1376  *p_lb = start + init * incr;
1377  *p_ub = start + limit * incr;
1378  }
1379 
1380  if (pr->flags.ordered) {
1381  pr->u.p.ordered_lower = init;
1382  pr->u.p.ordered_upper = limit;
1383  } // if
1384  } // if
1385  break;
1386  } // case
1387 #endif // ( KMP_STATIC_STEAL_ENABLED )
1388  case kmp_sch_static_balanced: {
1389  KD_TRACE(
1390  10,
1391  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1392  gtid));
1393  /* check if thread has any iteration to do */
1394  if ((status = !pr->u.p.count) != 0) {
1395  pr->u.p.count = 1;
1396  *p_lb = pr->u.p.lb;
1397  *p_ub = pr->u.p.ub;
1398  last = pr->u.p.parm1;
1399  if (p_st != NULL)
1400  *p_st = pr->u.p.st;
1401  } else { /* no iterations to do */
1402  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1403  }
1404  } // case
1405  break;
1406  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1407  merged here */
1408  case kmp_sch_static_chunked: {
1409  T parm1;
1410 
1411  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1412  "kmp_sch_static_[affinity|chunked] case\n",
1413  gtid));
1414  parm1 = pr->u.p.parm1;
1415 
1416  trip = pr->u.p.tc - 1;
1417  init = parm1 * (pr->u.p.count + tid);
1418 
1419  if ((status = (init <= trip)) != 0) {
1420  start = pr->u.p.lb;
1421  incr = pr->u.p.st;
1422  limit = parm1 + init - 1;
1423 
1424  if ((last = (limit >= trip)) != 0)
1425  limit = trip;
1426 
1427  if (p_st != NULL)
1428  *p_st = incr;
1429 
1430  pr->u.p.count += nproc;
1431 
1432  if (incr == 1) {
1433  *p_lb = start + init;
1434  *p_ub = start + limit;
1435  } else {
1436  *p_lb = start + init * incr;
1437  *p_ub = start + limit * incr;
1438  }
1439 
1440  if (pr->flags.ordered) {
1441  pr->u.p.ordered_lower = init;
1442  pr->u.p.ordered_upper = limit;
1443  } // if
1444  } // if
1445  } // case
1446  break;
1447 
1448  case kmp_sch_dynamic_chunked: {
1449  T chunk = pr->u.p.parm1;
1450 
1451  KD_TRACE(
1452  100,
1453  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1454  gtid));
1455 
1456  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1457  trip = pr->u.p.tc - 1;
1458 
1459  if ((status = (init <= trip)) == 0) {
1460  *p_lb = 0;
1461  *p_ub = 0;
1462  if (p_st != NULL)
1463  *p_st = 0;
1464  } else {
1465  start = pr->u.p.lb;
1466  limit = chunk + init - 1;
1467  incr = pr->u.p.st;
1468 
1469  if ((last = (limit >= trip)) != 0)
1470  limit = trip;
1471 
1472  if (p_st != NULL)
1473  *p_st = incr;
1474 
1475  if (incr == 1) {
1476  *p_lb = start + init;
1477  *p_ub = start + limit;
1478  } else {
1479  *p_lb = start + init * incr;
1480  *p_ub = start + limit * incr;
1481  }
1482 
1483  if (pr->flags.ordered) {
1484  pr->u.p.ordered_lower = init;
1485  pr->u.p.ordered_upper = limit;
1486  } // if
1487  } // if
1488  } // case
1489  break;
1490 
1491  case kmp_sch_guided_iterative_chunked: {
1492  T chunkspec = pr->u.p.parm1;
1493  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1494  "iterative case\n",
1495  gtid));
1496  trip = pr->u.p.tc;
1497  // Start atomic part of calculations
1498  while (1) {
1499  ST remaining; // signed, because can be < 0
1500  init = sh->u.s.iteration; // shared value
1501  remaining = trip - init;
1502  if (remaining <= 0) { // AC: need to compare with 0 first
1503  // nothing to do, don't try atomic op
1504  status = 0;
1505  break;
1506  }
1507  if ((T)remaining <
1508  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1509  // use dynamic-style schedule
1510  // atomically increment iterations, get old value
1511  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1512  (ST)chunkspec);
1513  remaining = trip - init;
1514  if (remaining <= 0) {
1515  status = 0; // all iterations got by other threads
1516  } else {
1517  // got some iterations to work on
1518  status = 1;
1519  if ((T)remaining > chunkspec) {
1520  limit = init + chunkspec - 1;
1521  } else {
1522  last = 1; // the last chunk
1523  limit = init + remaining - 1;
1524  } // if
1525  } // if
1526  break;
1527  } // if
1528  limit = init +
1529  (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1530  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1531  (ST)init, (ST)limit)) {
1532  // CAS was successful, chunk obtained
1533  status = 1;
1534  --limit;
1535  break;
1536  } // if
1537  } // while
1538  if (status != 0) {
1539  start = pr->u.p.lb;
1540  incr = pr->u.p.st;
1541  if (p_st != NULL)
1542  *p_st = incr;
1543  *p_lb = start + init * incr;
1544  *p_ub = start + limit * incr;
1545  if (pr->flags.ordered) {
1546  pr->u.p.ordered_lower = init;
1547  pr->u.p.ordered_upper = limit;
1548  } // if
1549  } else {
1550  *p_lb = 0;
1551  *p_ub = 0;
1552  if (p_st != NULL)
1553  *p_st = 0;
1554  } // if
1555  } // case
1556  break;
1557 
1558 #if OMP_45_ENABLED
1559  case kmp_sch_guided_simd: {
1560  // same as iterative but curr-chunk adjusted to be multiple of given
1561  // chunk
1562  T chunk = pr->u.p.parm1;
1563  KD_TRACE(100,
1564  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1565  gtid));
1566  trip = pr->u.p.tc;
1567  // Start atomic part of calculations
1568  while (1) {
1569  ST remaining; // signed, because can be < 0
1570  init = sh->u.s.iteration; // shared value
1571  remaining = trip - init;
1572  if (remaining <= 0) { // AC: need to compare with 0 first
1573  status = 0; // nothing to do, don't try atomic op
1574  break;
1575  }
1576  KMP_DEBUG_ASSERT(init % chunk == 0);
1577  // compare with K*nproc*(chunk+1), K=2 by default
1578  if ((T)remaining < pr->u.p.parm2) {
1579  // use dynamic-style schedule
1580  // atomically increment iterations, get old value
1581  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1582  (ST)chunk);
1583  remaining = trip - init;
1584  if (remaining <= 0) {
1585  status = 0; // all iterations got by other threads
1586  } else {
1587  // got some iterations to work on
1588  status = 1;
1589  if ((T)remaining > chunk) {
1590  limit = init + chunk - 1;
1591  } else {
1592  last = 1; // the last chunk
1593  limit = init + remaining - 1;
1594  } // if
1595  } // if
1596  break;
1597  } // if
1598  // divide by K*nproc
1599  UT span = remaining * (*(double *)&pr->u.p.parm3);
1600  UT rem = span % chunk;
1601  if (rem) // adjust so that span%chunk == 0
1602  span += chunk - rem;
1603  limit = init + span;
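// Example (illustrative, assuming parm3 = 0.125 as in a 4-thread team with
// the default guided parameters): remaining = 1000 and chunk = 8 give
// span = 125, rem = 5, so span is rounded up to 128 and the CAS below tries
// to claim iterations [init, init + 128).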
1604  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1605  (ST)init, (ST)limit)) {
1606  // CAS was successful, chunk obtained
1607  status = 1;
1608  --limit;
1609  break;
1610  } // if
1611  } // while
1612  if (status != 0) {
1613  start = pr->u.p.lb;
1614  incr = pr->u.p.st;
1615  if (p_st != NULL)
1616  *p_st = incr;
1617  *p_lb = start + init * incr;
1618  *p_ub = start + limit * incr;
1619  if (pr->flags.ordered) {
1620  pr->u.p.ordered_lower = init;
1621  pr->u.p.ordered_upper = limit;
1622  } // if
1623  } else {
1624  *p_lb = 0;
1625  *p_ub = 0;
1626  if (p_st != NULL)
1627  *p_st = 0;
1628  } // if
1629  } // case
1630  break;
1631 #endif // OMP_45_ENABLED
1632 
1633  case kmp_sch_guided_analytical_chunked: {
1634  T chunkspec = pr->u.p.parm1;
1635  UT chunkIdx;
1636 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1637  /* for storing original FPCW value for Windows* OS on
1638  IA-32 architecture 8-byte version */
1639  unsigned int oldFpcw;
1640  unsigned int fpcwSet = 0;
1641 #endif
1642  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1643  "kmp_sch_guided_analytical_chunked case\n",
1644  gtid));
1645 
1646  trip = pr->u.p.tc;
1647 
1648  KMP_DEBUG_ASSERT(nproc > 1);
1649  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1650 
1651  while (1) { /* this while loop is a safeguard against unexpected zero
1652  chunk sizes */
1653  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1654  if (chunkIdx >= (UT)pr->u.p.parm2) {
1655  --trip;
1656  /* use dynamic-style scheduling */
1657  init = chunkIdx * chunkspec + pr->u.p.count;
1658  /* need to verify init > 0 in case of overflow in the above
1659  * calculation */
1660  if ((status = (init > 0 && init <= trip)) != 0) {
1661  limit = init + chunkspec - 1;
1662 
1663  if ((last = (limit >= trip)) != 0)
1664  limit = trip;
1665  }
1666  break;
1667  } else {
1668 /* use exponential-style scheduling */
1669 /* The following check is to work around the lack of long double precision on
1670  Windows* OS.
1671  This check works around the possible effect that init != 0 for chunkIdx == 0.
1672  */
1673 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1674  /* If we haven't already done so, save original
1675  FPCW and set precision to 64-bit, as Windows* OS
1676  on IA-32 architecture defaults to 53-bit */
1677  if (!fpcwSet) {
1678  oldFpcw = _control87(0, 0);
1679  _control87(_PC_64, _MCW_PC);
1680  fpcwSet = 0x30000;
1681  }
1682 #endif
1683  if (chunkIdx) {
1684  init = __kmp_dispatch_guided_remaining<T>(
1685  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1686  KMP_DEBUG_ASSERT(init);
1687  init = trip - init;
1688  } else
1689  init = 0;
1690  limit = trip - __kmp_dispatch_guided_remaining<T>(
1691  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1692  KMP_ASSERT(init <= limit);
1693  if (init < limit) {
1694  KMP_DEBUG_ASSERT(limit <= trip);
1695  --limit;
1696  status = 1;
1697  break;
1698  } // if
1699  } // if
1700  } // while (1)
1701 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1702  /* restore FPCW if necessary
1703  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1704  */
1705  if (fpcwSet && (oldFpcw & fpcwSet))
1706  _control87(oldFpcw, _MCW_PC);
1707 #endif
1708  if (status != 0) {
1709  start = pr->u.p.lb;
1710  incr = pr->u.p.st;
1711  if (p_st != NULL)
1712  *p_st = incr;
1713  *p_lb = start + init * incr;
1714  *p_ub = start + limit * incr;
1715  if (pr->flags.ordered) {
1716  pr->u.p.ordered_lower = init;
1717  pr->u.p.ordered_upper = limit;
1718  }
1719  } else {
1720  *p_lb = 0;
1721  *p_ub = 0;
1722  if (p_st != NULL)
1723  *p_st = 0;
1724  }
1725  } // case
1726  break;
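/* Descriptive note on the case above (assuming parm2 and parm3 were set up by
   __kmp_dispatch_init_algorithm as the cross-over chunk index and the
   analytically derived base for __kmp_dispatch_guided_remaining, which is not
   shown here): chunk indices below parm2 take the exponential path, where a
   chunk covers iterations

       trip - remaining(chunkIdx)  ..  trip - remaining(chunkIdx + 1) - 1,

   while indices at or beyond parm2 fall back to handing out fixed chunks of
   chunkspec iterations, dynamic-style; the init > 0 check guards against
   overflow only on that tail path. */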
1727 
1728  case kmp_sch_trapezoidal: {
1729  UT index;
1730  T parm2 = pr->u.p.parm2;
1731  T parm3 = pr->u.p.parm3;
1732  T parm4 = pr->u.p.parm4;
1733  KD_TRACE(100,
1734  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1735  gtid));
1736 
1737  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1738 
1739  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1740  trip = pr->u.p.tc - 1;
1741 
1742  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1743  *p_lb = 0;
1744  *p_ub = 0;
1745  if (p_st != NULL)
1746  *p_st = 0;
1747  } else {
1748  start = pr->u.p.lb;
1749  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1750  incr = pr->u.p.st;
1751 
1752  if ((last = (limit >= trip)) != 0)
1753  limit = trip;
1754 
1755  if (p_st != NULL)
1756  *p_st = incr;
1757 
1758  if (incr == 1) {
1759  *p_lb = start + init;
1760  *p_ub = start + limit;
1761  } else {
1762  *p_lb = start + init * incr;
1763  *p_ub = start + limit * incr;
1764  }
1765 
1766  if (pr->flags.ordered) {
1767  pr->u.p.ordered_lower = init;
1768  pr->u.p.ordered_upper = limit;
1769  } // if
1770  } // if
1771  } // case
1772  break;
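/* Descriptive note on the trapezoid formulas above (assuming, as set up in
   __kmp_dispatch_init_algorithm and not shown here, that parm2 is the first
   chunk size, parm4 the per-chunk decrement and parm3 the number of chunks):
   chunk k has size parm2 - k*parm4, so the first iteration of chunk `index`
   is the partial sum of that arithmetic series,

       init(index) = index * (2*parm2 - (index - 1)*parm4) / 2,

   which is the expression computed above, and limit is init(index + 1) - 1.
   For example, with parm2 = 10 and parm4 = 2 the successive chunks cover
   iterations [0..9], [10..17], [18..23], shrinking toward the minimum. */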
1773  default: {
1774  status = 0; // to avoid complaints on uninitialized variable use
1775  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1776  KMP_HNT(GetNewerLibrary), // Hint
1777  __kmp_msg_null // Variadic argument list terminator
1778  );
1779  } break;
1780  } // switch
1781  if (p_last)
1782  *p_last = last;
1783 #ifdef KMP_DEBUG
1784  if (pr->flags.ordered) {
1785  char *buff;
1786  // create format specifiers before the debug output
1787  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1788  "ordered_lower:%%%s ordered_upper:%%%s\n",
1789  traits_t<UT>::spec, traits_t<UT>::spec);
1790  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1791  __kmp_str_free(&buff);
1792  }
1793  {
1794  char *buff;
1795  // create format specifiers before the debug output
1796  buff = __kmp_str_format(
1797  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1798  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1799  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1800  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1801  __kmp_str_free(&buff);
1802  }
1803 #endif
1804  return status;
1805 }
1806 
1807 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1808  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1809  is not called. */
1810 #if OMPT_SUPPORT && OMPT_OPTIONAL
1811 #define OMPT_LOOP_END \
1812  if (status == 0) { \
1813  if (ompt_enabled.ompt_callback_work) { \
1814  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1815  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1816  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1817  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1818  &(task_info->task_data), 0, codeptr); \
1819  } \
1820  }
1821 // TODO: implement count
1822 #else
1823 #define OMPT_LOOP_END // no-op
1824 #endif
1825 
1826 #if KMP_STATS_ENABLED
1827 #define KMP_STATS_LOOP_END \
1828  { \
1829  kmp_int64 u, l, t, i; \
1830  l = (kmp_int64)(*p_lb); \
1831  u = (kmp_int64)(*p_ub); \
1832  i = (kmp_int64)(pr->u.p.st); \
1833  if (status == 0) { \
1834  t = 0; \
1835  KMP_POP_PARTITIONED_TIMER(); \
1836  } else if (i == 1) { \
1837  if (u >= l) \
1838  t = u - l + 1; \
1839  else \
1840  t = 0; \
1841  } else if (i < 0) { \
1842  if (l >= u) \
1843  t = (l - u) / (-i) + 1; \
1844  else \
1845  t = 0; \
1846  } else { \
1847  if (u >= l) \
1848  t = (u - l) / i + 1; \
1849  else \
1850  t = 0; \
1851  } \
1852  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1853  }
1854 #else
1855 #define KMP_STATS_LOOP_END /* Nothing */
1856 #endif
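/* Worked example of the KMP_STATS_LOOP_END trip count (illustrative only):
   for *p_lb = 0, *p_ub = 9 and stride 2 it records (9 - 0) / 2 + 1 = 5
   iterations; for a backwards chunk *p_lb = 10, *p_ub = 2, stride -4 it
   records (10 - 2) / 4 + 1 = 3; and it records 0 when status == 0 or when the
   bounds have already crossed. */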
1857 
1858 template <typename T>
1859 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1860  T *p_lb, T *p_ub,
1861  typename traits_t<T>::signed_t *p_st
1862 #if OMPT_SUPPORT && OMPT_OPTIONAL
1863  ,
1864  void *codeptr
1865 #endif
1866  ) {
1867 
1868  typedef typename traits_t<T>::unsigned_t UT;
1869  typedef typename traits_t<T>::signed_t ST;
1870  typedef typename traits_t<T>::floating_t DBL;
1871  // This is potentially slightly misleading: schedule(runtime) will appear here
1872  // even if the actual runtime schedule is static. (Which points out a
1873  // disadvantage of schedule(runtime): even when static scheduling is used it
1874  // costs more than a compile-time choice to use static scheduling would.)
1875  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1876 
1877  int status;
1878  dispatch_private_info_template<T> *pr;
1879  kmp_info_t *th = __kmp_threads[gtid];
1880  kmp_team_t *team = th->th.th_team;
1881 
1882  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1883  KD_TRACE(
1884  1000,
1885  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1886  gtid, p_lb, p_ub, p_st, p_last));
1887 
1888  if (team->t.t_serialized) {
1889  /* NOTE: serialize this dispatch because we are not at the active level */
1890  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1891  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1892  KMP_DEBUG_ASSERT(pr);
1893 
1894  if ((status = (pr->u.p.tc != 0)) == 0) {
1895  *p_lb = 0;
1896  *p_ub = 0;
1897  // if ( p_last != NULL )
1898  // *p_last = 0;
1899  if (p_st != NULL)
1900  *p_st = 0;
1901  if (__kmp_env_consistency_check) {
1902  if (pr->pushed_ws != ct_none) {
1903  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1904  }
1905  }
1906  } else if (pr->flags.nomerge) {
1907  kmp_int32 last;
1908  T start;
1909  UT limit, trip, init;
1910  ST incr;
1911  T chunk = pr->u.p.parm1;
1912 
1913  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1914  gtid));
1915 
1916  init = chunk * pr->u.p.count++;
1917  trip = pr->u.p.tc - 1;
1918 
1919  if ((status = (init <= trip)) == 0) {
1920  *p_lb = 0;
1921  *p_ub = 0;
1922  // if ( p_last != NULL )
1923  // *p_last = 0;
1924  if (p_st != NULL)
1925  *p_st = 0;
1926  if (__kmp_env_consistency_check) {
1927  if (pr->pushed_ws != ct_none) {
1928  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1929  }
1930  }
1931  } else {
1932  start = pr->u.p.lb;
1933  limit = chunk + init - 1;
1934  incr = pr->u.p.st;
1935 
1936  if ((last = (limit >= trip)) != 0) {
1937  limit = trip;
1938 #if KMP_OS_WINDOWS
1939  pr->u.p.last_upper = pr->u.p.ub;
1940 #endif /* KMP_OS_WINDOWS */
1941  }
1942  if (p_last != NULL)
1943  *p_last = last;
1944  if (p_st != NULL)
1945  *p_st = incr;
1946  if (incr == 1) {
1947  *p_lb = start + init;
1948  *p_ub = start + limit;
1949  } else {
1950  *p_lb = start + init * incr;
1951  *p_ub = start + limit * incr;
1952  }
1953 
1954  if (pr->flags.ordered) {
1955  pr->u.p.ordered_lower = init;
1956  pr->u.p.ordered_upper = limit;
1957 #ifdef KMP_DEBUG
1958  {
1959  char *buff;
1960  // create format specifiers before the debug output
1961  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1962  "ordered_lower:%%%s ordered_upper:%%%s\n",
1963  traits_t<UT>::spec, traits_t<UT>::spec);
1964  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1965  pr->u.p.ordered_upper));
1966  __kmp_str_free(&buff);
1967  }
1968 #endif
1969  } // if
1970  } // if
1971  } else {
1972  pr->u.p.tc = 0;
1973  *p_lb = pr->u.p.lb;
1974  *p_ub = pr->u.p.ub;
1975 #if KMP_OS_WINDOWS
1976  pr->u.p.last_upper = *p_ub;
1977 #endif /* KMP_OS_WINDOWS */
1978  if (p_last != NULL)
1979  *p_last = TRUE;
1980  if (p_st != NULL)
1981  *p_st = pr->u.p.st;
1982  } // if
1983 #ifdef KMP_DEBUG
1984  {
1985  char *buff;
1986  // create format specifiers before the debug output
1987  buff = __kmp_str_format(
1988  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1989  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1990  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1991  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1992  __kmp_str_free(&buff);
1993  }
1994 #endif
1995 #if INCLUDE_SSC_MARKS
1996  SSC_MARK_DISPATCH_NEXT();
1997 #endif
1998  OMPT_LOOP_END;
1999  KMP_STATS_LOOP_END;
2000  return status;
2001  } else {
2002  kmp_int32 last = 0;
2003  dispatch_shared_info_template<T> volatile *sh;
2004 
2005  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2006  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2007 
2008  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2009  th->th.th_dispatch->th_dispatch_pr_current);
2010  KMP_DEBUG_ASSERT(pr);
2011  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2012  th->th.th_dispatch->th_dispatch_sh_current);
2013  KMP_DEBUG_ASSERT(sh);
2014 
2015 #if KMP_USE_HIER_SCHED
2016  if (pr->flags.use_hier)
2017  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2018  else
2019 #endif // KMP_USE_HIER_SCHED
2020  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2021  p_st, th->th.th_team_nproc,
2022  th->th.th_info.ds.ds_tid);
2023  // status == 0: no more iterations to execute
2024  if (status == 0) {
2025  UT num_done;
2026 
2027  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2028 #ifdef KMP_DEBUG
2029  {
2030  char *buff;
2031  // create format specifiers before the debug output
2032  buff = __kmp_str_format(
2033  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2034  traits_t<UT>::spec);
2035  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2036  __kmp_str_free(&buff);
2037  }
2038 #endif
2039 
2040 #if KMP_USE_HIER_SCHED
2041  pr->flags.use_hier = FALSE;
2042 #endif
2043  if ((ST)num_done == th->th.th_team_nproc - 1) {
2044 #if (KMP_STATIC_STEAL_ENABLED)
2045  if (pr->schedule == kmp_sch_static_steal &&
2046  traits_t<T>::type_size > 4) {
2047  int i;
2048  kmp_info_t **other_threads = team->t.t_threads;
2049  // loop complete, safe to destroy locks used for stealing
2050  for (i = 0; i < th->th.th_team_nproc; ++i) {
2051  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2052  KMP_ASSERT(lck != NULL);
2053  __kmp_destroy_lock(lck);
2054  __kmp_free(lck);
2055  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2056  }
2057  }
2058 #endif
2059  /* NOTE: release this buffer to be reused */
2060 
2061  KMP_MB(); /* Flush all pending memory write invalidates. */
2062 
2063  sh->u.s.num_done = 0;
2064  sh->u.s.iteration = 0;
2065 
2066  /* TODO replace with general release procedure? */
2067  if (pr->flags.ordered) {
2068  sh->u.s.ordered_iteration = 0;
2069  }
2070 
2071  KMP_MB(); /* Flush all pending memory write invalidates. */
2072 
2073  sh->buffer_index += __kmp_dispatch_num_buffers;
2074  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2075  gtid, sh->buffer_index));
2076 
2077  KMP_MB(); /* Flush all pending memory write invalidates. */
2078 
2079  } // if
2080  if (__kmp_env_consistency_check) {
2081  if (pr->pushed_ws != ct_none) {
2082  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2083  }
2084  }
2085 
2086  th->th.th_dispatch->th_deo_fcn = NULL;
2087  th->th.th_dispatch->th_dxo_fcn = NULL;
2088  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2089  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2090  } // if (status == 0)
2091 #if KMP_OS_WINDOWS
2092  else if (last) {
2093  pr->u.p.last_upper = pr->u.p.ub;
2094  }
2095 #endif /* KMP_OS_WINDOWS */
2096  if (p_last != NULL && status != 0)
2097  *p_last = last;
2098  } // if
2099 
2100 #ifdef KMP_DEBUG
2101  {
2102  char *buff;
2103  // create format specifiers before the debug output
2104  buff = __kmp_str_format(
2105  "__kmp_dispatch_next: T#%%d normal case: "
2106  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2107  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2108  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2109  (p_last ? *p_last : 0), status));
2110  __kmp_str_free(&buff);
2111  }
2112 #endif
2113 #if INCLUDE_SSC_MARKS
2114  SSC_MARK_DISPATCH_NEXT();
2115 #endif
2116  OMPT_LOOP_END;
2117  KMP_STATS_LOOP_END;
2118  return status;
2119 }
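/* Descriptive note on the return contract above: a result of 1 means a chunk
   was obtained and the caller should execute iterations *p_lb .. *p_ub with
   stride *p_st; a result of 0 means no more chunks remain for this thread, the
   bounds are zeroed, *p_last is not updated on the team path, and the shared
   dispatch buffer may already have been released for reuse by a later loop. */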
2120 
2121 template <typename T>
2122 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2123  kmp_int32 *plastiter, T *plower, T *pupper,
2124  typename traits_t<T>::signed_t incr) {
2125  typedef typename traits_t<T>::unsigned_t UT;
2126  typedef typename traits_t<T>::signed_t ST;
2127  kmp_uint32 team_id;
2128  kmp_uint32 nteams;
2129  UT trip_count;
2130  kmp_team_t *team;
2131  kmp_info_t *th;
2132 
2133  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2134  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2135 #ifdef KMP_DEBUG
2136  {
2137  char *buff;
2138  // create format specifiers before the debug output
2139  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2140  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2141  traits_t<T>::spec, traits_t<T>::spec,
2142  traits_t<ST>::spec, traits_t<T>::spec);
2143  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2144  __kmp_str_free(&buff);
2145  }
2146 #endif
2147 
2148  if (__kmp_env_consistency_check) {
2149  if (incr == 0) {
2150  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2151  loc);
2152  }
2153  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2154  // The loop is illegal.
2155  // Some zero-trip loops are maintained by the compiler, e.g.:
2156  // for(i=10;i<0;++i) // lower >= upper - run-time check
2157  // for(i=0;i>10;--i) // lower <= upper - run-time check
2158  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2159  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2160  // The compiler does not check the following illegal loops:
2161  // for(i=0;i<10;i+=incr) // where incr<0
2162  // for(i=10;i>0;i-=incr) // where incr<0
2163  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2164  }
2165  }
2166  th = __kmp_threads[gtid];
2167  team = th->th.th_team;
2168 #if OMP_40_ENABLED
2169  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2170  nteams = th->th.th_teams_size.nteams;
2171 #endif
2172  team_id = team->t.t_master_tid;
2173  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2174 
2175  // compute global trip count
2176  if (incr == 1) {
2177  trip_count = *pupper - *plower + 1;
2178  } else if (incr == -1) {
2179  trip_count = *plower - *pupper + 1;
2180  } else if (incr > 0) {
2181  // upper-lower can exceed the limit of signed type
2182  trip_count = (UT)(*pupper - *plower) / incr + 1;
2183  } else {
2184  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2185  }
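  /* Worked example of the trip count above (illustrative only): for
     *plower = 0, *pupper = 9, incr = 3 the count is (9 - 0) / 3 + 1 = 4,
     i.e. iterations 0, 3, 6, 9; the incr == 1 and incr == -1 fast paths skip
     the division, and the cast to UT avoids overflow when the difference does
     not fit in the signed type. */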
2186 
2187  if (trip_count <= nteams) {
2188  KMP_DEBUG_ASSERT(
2189  __kmp_static == kmp_sch_static_greedy ||
2190  __kmp_static ==
2191  kmp_sch_static_balanced); // Unknown static scheduling type.
2192  // only some teams get single iteration, others get nothing
2193  if (team_id < trip_count) {
2194  *pupper = *plower = *plower + team_id * incr;
2195  } else {
2196  *plower = *pupper + incr; // zero-trip loop
2197  }
2198  if (plastiter != NULL)
2199  *plastiter = (team_id == trip_count - 1);
2200  } else {
2201  if (__kmp_static == kmp_sch_static_balanced) {
2202  UT chunk = trip_count / nteams;
2203  UT extras = trip_count % nteams;
2204  *plower +=
2205  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2206  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2207  if (plastiter != NULL)
2208  *plastiter = (team_id == nteams - 1);
2209  } else {
2210  T chunk_inc_count =
2211  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2212  T upper = *pupper;
2213  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2214  // Unknown static scheduling type.
2215  *plower += team_id * chunk_inc_count;
2216  *pupper = *plower + chunk_inc_count - incr;
2217  // Check/correct bounds if needed
2218  if (incr > 0) {
2219  if (*pupper < *plower)
2220  *pupper = traits_t<T>::max_value;
2221  if (plastiter != NULL)
2222  *plastiter = *plower <= upper && *pupper > upper - incr;
2223  if (*pupper > upper)
2224  *pupper = upper; // tracker C73258
2225  } else {
2226  if (*pupper > *plower)
2227  *pupper = traits_t<T>::min_value;
2228  if (plastiter != NULL)
2229  *plastiter = *plower >= upper && *pupper < upper - incr;
2230  if (*pupper < upper)
2231  *pupper = upper; // tracker C73258
2232  }
2233  }
2234  }
2235 }
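/* Illustrative example of the team split above (a sketch, not part of the
   original source): with trip_count = 10, nteams = 4, incr = 1 and an initial
   *plower of 0, the balanced branch computes chunk = 2 and extras = 2, giving
   teams 0..3 the ranges [0,2], [3,5], [6,7] and [8,9] (3, 3, 2, 2 iterations),
   with *plastiter set only for team 3. The greedy branch instead gives every
   team ceil(trip_count / nteams) iterations and clips the last team's upper
   bound back to the original upper. */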
2236 
2237 //-----------------------------------------------------------------------------
2238 // Dispatch routines
2239 // Transfer call to template< type T >
2240 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2241 // T lb, T ub, ST st, ST chunk )
2242 extern "C" {
2243 
2260 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2261  enum sched_type schedule, kmp_int32 lb,
2262  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2263  KMP_DEBUG_ASSERT(__kmp_init_serial);
2264 #if OMPT_SUPPORT && OMPT_OPTIONAL
2265  OMPT_STORE_RETURN_ADDRESS(gtid);
2266 #endif
2267  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2268 }
2272 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2273  enum sched_type schedule, kmp_uint32 lb,
2274  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2275  KMP_DEBUG_ASSERT(__kmp_init_serial);
2276 #if OMPT_SUPPORT && OMPT_OPTIONAL
2277  OMPT_STORE_RETURN_ADDRESS(gtid);
2278 #endif
2279  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2280 }
2281 
2285 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2286  enum sched_type schedule, kmp_int64 lb,
2287  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2288  KMP_DEBUG_ASSERT(__kmp_init_serial);
2289 #if OMPT_SUPPORT && OMPT_OPTIONAL
2290  OMPT_STORE_RETURN_ADDRESS(gtid);
2291 #endif
2292  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2293 }
2294 
2298 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2299  enum sched_type schedule, kmp_uint64 lb,
2300  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2301  KMP_DEBUG_ASSERT(__kmp_init_serial);
2302 #if OMPT_SUPPORT && OMPT_OPTIONAL
2303  OMPT_STORE_RETURN_ADDRESS(gtid);
2304 #endif
2305  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2306 }
2307 
2317 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2318  enum sched_type schedule, kmp_int32 *p_last,
2319  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2320  kmp_int32 chunk) {
2321  KMP_DEBUG_ASSERT(__kmp_init_serial);
2322 #if OMPT_SUPPORT && OMPT_OPTIONAL
2323  OMPT_STORE_RETURN_ADDRESS(gtid);
2324 #endif
2325  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2326  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2327 }
2328 
2329 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2330  enum sched_type schedule, kmp_int32 *p_last,
2331  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2332  kmp_int32 chunk) {
2333  KMP_DEBUG_ASSERT(__kmp_init_serial);
2334 #if OMPT_SUPPORT && OMPT_OPTIONAL
2335  OMPT_STORE_RETURN_ADDRESS(gtid);
2336 #endif
2337  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2338  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2339 }
2340 
2341 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2342  enum sched_type schedule, kmp_int32 *p_last,
2343  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2344  kmp_int64 chunk) {
2345  KMP_DEBUG_ASSERT(__kmp_init_serial);
2346 #if OMPT_SUPPORT && OMPT_OPTIONAL
2347  OMPT_STORE_RETURN_ADDRESS(gtid);
2348 #endif
2349  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2350  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2351 }
2352 
2353 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2354  enum sched_type schedule, kmp_int32 *p_last,
2355  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2356  kmp_int64 chunk) {
2357  KMP_DEBUG_ASSERT(__kmp_init_serial);
2358 #if OMPT_SUPPORT && OMPT_OPTIONAL
2359  OMPT_STORE_RETURN_ADDRESS(gtid);
2360 #endif
2361  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2362  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2363 }
2364 
2378 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2379  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2380 #if OMPT_SUPPORT && OMPT_OPTIONAL
2381  OMPT_STORE_RETURN_ADDRESS(gtid);
2382 #endif
2383  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2384 #if OMPT_SUPPORT && OMPT_OPTIONAL
2385  ,
2386  OMPT_LOAD_RETURN_ADDRESS(gtid)
2387 #endif
2388  );
2389 }
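/* Illustrative usage (a sketch of compiler-generated code for something like
   "#pragma omp for schedule(dynamic, 4)" over signed 32-bit iterations with a
   positive stride; the local names are hypothetical):

       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                              0, n - 1, 1, 4);
       while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
         for (kmp_int32 i = lb; i <= ub; i += st)
           body(i);
       }

   The __kmpc_dispatch_fini_* entry points below mark the end of a chunk
   (significant for ordered loops) and, as the comment near OMPT_LOOP_END
   notes, are not called in every case. */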
2390 
2394 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2395  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2396  kmp_int32 *p_st) {
2397 #if OMPT_SUPPORT && OMPT_OPTIONAL
2398  OMPT_STORE_RETURN_ADDRESS(gtid);
2399 #endif
2400  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2401 #if OMPT_SUPPORT && OMPT_OPTIONAL
2402  ,
2403  OMPT_LOAD_RETURN_ADDRESS(gtid)
2404 #endif
2405  );
2406 }
2407 
2411 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2412  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2413 #if OMPT_SUPPORT && OMPT_OPTIONAL
2414  OMPT_STORE_RETURN_ADDRESS(gtid);
2415 #endif
2416  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
2418  ,
2419  OMPT_LOAD_RETURN_ADDRESS(gtid)
2420 #endif
2421  );
2422 }
2423 
2427 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2428  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2429  kmp_int64 *p_st) {
2430 #if OMPT_SUPPORT && OMPT_OPTIONAL
2431  OMPT_STORE_RETURN_ADDRESS(gtid);
2432 #endif
2433  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435  ,
2436  OMPT_LOAD_RETURN_ADDRESS(gtid)
2437 #endif
2438  );
2439 }
2440 
2447 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2448  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2449 }
2450 
2454 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2455  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2456 }
2457 
2461 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2462  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2463 }
2464 
2468 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2469  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2470 }
2473 //-----------------------------------------------------------------------------
2474 // Non-template routines from kmp_dispatch.cpp used in other sources
2475 
2476 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2477  return value == checker;
2478 }
2479 
2480 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2481  return value != checker;
2482 }
2483 
2484 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2485  return value < checker;
2486 }
2487 
2488 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2489  return value >= checker;
2490 }
2491 
2492 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2493  return value <= checker;
2494 }
2495 
2496 kmp_uint32
2497 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2498  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2499  void *obj // Higher-level synchronization object, or NULL.
2500  ) {
2501  // note: we may not belong to a team at this point
2502  volatile kmp_uint32 *spin = spinner;
2503  kmp_uint32 check = checker;
2504  kmp_uint32 spins;
2505  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2506  kmp_uint32 r;
2507 
2508  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2509  KMP_INIT_YIELD(spins);
2510  // main wait spin loop
2511  while (!f(r = TCR_4(*spin), check)) {
2512  KMP_FSYNC_SPIN_PREPARE(obj);
2513  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2514  split. It causes problems with infinite recursion because of exit lock */
2515  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2516  __kmp_abort_thread(); */
2517 
2518  /* if we have waited a bit, or are oversubscribed, yield */
2519  /* pause is in the following code */
2520  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2521  KMP_YIELD_SPIN(spins);
2522  }
2523  KMP_FSYNC_SPIN_ACQUIRED(obj);
2524  return r;
2525 }
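/* Illustrative note (an assumed typical use, not a new entry point): the
   predicate parameter lets callers spin on arbitrary conditions, for example

       __kmp_wait_yield_4(&sh->buffer_index, my_buffer_index, __kmp_eq_4, NULL);

   would spin, yielding when oversubscribed, until the shared buffer_index
   reaches the caller's expected value. */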
2526 
2527 void __kmp_wait_yield_4_ptr(
2528  void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2529  void *obj // Higher-level synchronization object, or NULL.
2530  ) {
2531  // note: we may not belong to a team at this point
2532  void *spin = spinner;
2533  kmp_uint32 check = checker;
2534  kmp_uint32 spins;
2535  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2536 
2537  KMP_FSYNC_SPIN_INIT(obj, spin);
2538  KMP_INIT_YIELD(spins);
2539  // main wait spin loop
2540  while (!f(spin, check)) {
2541  KMP_FSYNC_SPIN_PREPARE(obj);
2542  /* if we have waited a bit, or are oversubscribed, yield */
2543  /* pause is in the following code */
2544  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2545  KMP_YIELD_SPIN(spins);
2546  }
2547  KMP_FSYNC_SPIN_ACQUIRED(obj);
2548 }
2549 
2550 } // extern "C"
2551 
2552 #ifdef KMP_GOMP_COMPAT
2553 
2554 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2555  enum sched_type schedule, kmp_int32 lb,
2556  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2557  int push_ws) {
2558  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2559  push_ws);
2560 }
2561 
2562 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2563  enum sched_type schedule, kmp_uint32 lb,
2564  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2565  int push_ws) {
2566  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2567  push_ws);
2568 }
2569 
2570 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2571  enum sched_type schedule, kmp_int64 lb,
2572  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2573  int push_ws) {
2574  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2575  push_ws);
2576 }
2577 
2578 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2579  enum sched_type schedule, kmp_uint64 lb,
2580  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2581  int push_ws) {
2582  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2583  push_ws);
2584 }
2585 
2586 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2587  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2588 }
2589 
2590 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2591  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2592 }
2593 
2594 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2595  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2596 }
2597 
2598 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2599  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2600 }
2601 
2602 #endif /* KMP_GOMP_COMPAT */
2603 
2604 /* ------------------------------------------------------------------------ */