LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is constant inside any dispatch loop, but it may
16 * change value between parallel regions. __kmp_max_nth is the largest
17 * value __kmp_nth can take; 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56 }
57 }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 // TODO: make nonmonotonic when static_steal is fixed
76 int monotonicity = SCHEDULE_MONOTONIC;
77
78 // Default to monotonic for executables compiled by compilers that
79 // support OpenMP* 4.5 or earlier
80 if (loc != NULL && loc->get_openmp_version() < 50)
81 monotonicity = SCHEDULE_MONOTONIC;
82
83 if (use_hier || __kmp_force_monotonic)
84 monotonicity = SCHEDULE_MONOTONIC;
85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86 monotonicity = SCHEDULE_NONMONOTONIC;
87 else if (SCHEDULE_HAS_MONOTONIC(schedule))
88 monotonicity = SCHEDULE_MONOTONIC;
89
90 return monotonicity;
91}
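// For example (illustrative summary of the logic above, not new behavior):
//   schedule(nonmonotonic:dynamic) -> SCHEDULE_NONMONOTONIC, unless use_hier
//       or __kmp_force_monotonic overrides it back to monotonic;
//   schedule(monotonic:dynamic)    -> SCHEDULE_MONOTONIC;
//   schedule(dynamic) with no modifier keeps the initial SCHEDULE_MONOTONIC
//       default, whether the code was compiled as OpenMP 4.5 or 5.0.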
92
93#if KMP_STATIC_STEAL_ENABLED
94enum { // values for steal_flag (possible states of private per-loop buffer)
95 UNUSED = 0,
96 CLAIMED = 1, // owner thread started initialization
97 READY = 2, // available for stealing
98 THIEF = 3 // finished by owner, or claimed by thief
99 // possible state changes:
100 // 0 -> 1 owner only, sync
101 // 0 -> 3 thief only, sync
102 // 1 -> 2 owner only, async
103 // 2 -> 3 owner only, async
104 // 3 -> 2 owner only, async
105 // 3 -> 0 last thread finishing the loop, async
106};
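// Illustrative lifecycle of one thread's per-loop buffer (a sketch of the
// transitions listed above, not an exhaustive trace):
//   owner: CAS 0 -> 1 (CLAIMED), fill count/ub, then store 1 -> 2 (READY);
//   while READY, thieves shrink ub from the tail under the lock / 64-bit CAS;
//   a thief may also CAS an untouched buffer 0 -> 3 and take its whole range;
//   owner: once its range is drained it marks 2 -> 3 (THIEF) and goes
//          stealing; a successful steal refills the buffer, setting 3 -> 2;
//   the last thread to finish the loop resets 3 -> 0 so the buffer can be
//   reused by the next dynamically scheduled loop.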
107#endif
108
109// Initialize a dispatch_private_info_template<T> buffer for a particular
110// type of schedule and chunk. The loop description is found in lb (lower bound),
111// ub (upper bound), and st (stride). nproc is the number of threads relevant
112// to the scheduling (often the number of threads in a team, but not always if
113// hierarchical scheduling is used). tid is the id of the thread calling
114// the function within the group of nproc threads. It will have a value
115// between 0 and nproc - 1. This is often just the thread id within a team, but
116// is not necessarily the case when using hierarchical scheduling.
117// loc is the source file location of the corresponding loop
118// gtid is the global thread id
119template <typename T>
120void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121 dispatch_private_info_template<T> *pr,
122 enum sched_type schedule, T lb, T ub,
123 typename traits_t<T>::signed_t st,
124#if USE_ITT_BUILD
125 kmp_uint64 *cur_chunk,
126#endif
127 typename traits_t<T>::signed_t chunk,
128 T nproc, T tid) {
129 typedef typename traits_t<T>::unsigned_t UT;
130 typedef typename traits_t<T>::floating_t DBL;
131
132 int active;
133 T tc;
134 kmp_info_t *th;
135 kmp_team_t *team;
136 int monotonicity;
137 bool use_hier;
138
139#ifdef KMP_DEBUG
140 typedef typename traits_t<T>::signed_t ST;
141 {
142 char *buff;
143 // create format specifiers before the debug output
144 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147 traits_t<T>::spec, traits_t<T>::spec,
148 traits_t<ST>::spec, traits_t<ST>::spec,
149 traits_t<T>::spec, traits_t<T>::spec);
150 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151 __kmp_str_free(&buff);
152 }
153#endif
154 /* setup data */
155 th = __kmp_threads[gtid];
156 team = th->th.th_team;
157 active = !team->t.t_serialized;
158
159#if USE_ITT_BUILD
160 int itt_need_metadata_reporting =
161 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163 team->t.t_active_level == 1;
164#endif
165
166#if KMP_USE_HIER_SCHED
167 use_hier = pr->flags.use_hier;
168#else
169 use_hier = false;
170#endif
171
172 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175
176 /* Pick up the nomerge/ordered bits from the scheduling type */
177 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178 pr->flags.nomerge = TRUE;
179 schedule =
180 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181 } else {
182 pr->flags.nomerge = FALSE;
183 }
184 pr->type_size = traits_t<T>::type_size; // remember the size of variables
185 if (kmp_ord_lower & schedule) {
186 pr->flags.ordered = TRUE;
187 schedule =
188 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189 } else {
190 pr->flags.ordered = FALSE;
191 }
192 // Ordered overrides nonmonotonic
193 if (pr->flags.ordered) {
194 monotonicity = SCHEDULE_MONOTONIC;
195 }
196
197 if (schedule == kmp_sch_static) {
198 schedule = __kmp_static;
199 } else {
200 if (schedule == kmp_sch_runtime) {
201 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202 // not specified)
203 schedule = team->t.t_sched.r_sched_type;
204 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207 monotonicity = SCHEDULE_MONOTONIC;
208 // Detail the schedule if needed (global controls are differentiated
209 // appropriately)
210 if (schedule == kmp_sch_guided_chunked) {
211 schedule = __kmp_guided;
212 } else if (schedule == kmp_sch_static) {
213 schedule = __kmp_static;
214 }
215 // Use the chunk size specified by OMP_SCHEDULE (or default if not
216 // specified)
217 chunk = team->t.t_sched.chunk;
218#if USE_ITT_BUILD
219 if (cur_chunk)
220 *cur_chunk = chunk;
221#endif
222#ifdef KMP_DEBUG
223 {
224 char *buff;
225 // create format specifiers before the debug output
226 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227 "schedule:%%d chunk:%%%s\n",
228 traits_t<ST>::spec);
229 KD_TRACE(10, (buff, gtid, schedule, chunk));
230 __kmp_str_free(&buff);
231 }
232#endif
233 } else {
234 if (schedule == kmp_sch_guided_chunked) {
235 schedule = __kmp_guided;
236 }
237 if (chunk <= 0) {
238 chunk = KMP_DEFAULT_CHUNK;
239 }
240 }
241
242 if (schedule == kmp_sch_auto) {
243 // mapping and differentiation: done in __kmp_do_serial_initialize()
244 schedule = __kmp_auto;
245#ifdef KMP_DEBUG
246 {
247 char *buff;
248 // create format specifiers before the debug output
249 buff = __kmp_str_format(
250 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251 "schedule:%%d chunk:%%%s\n",
252 traits_t<ST>::spec);
253 KD_TRACE(10, (buff, gtid, schedule, chunk));
254 __kmp_str_free(&buff);
255 }
256#endif
257 }
258#if KMP_STATIC_STEAL_ENABLED
259 // map nonmonotonic:dynamic to static steal
260 if (schedule == kmp_sch_dynamic_chunked) {
261 if (monotonicity == SCHEDULE_NONMONOTONIC)
262 schedule = kmp_sch_static_steal;
263 }
264#endif
265 /* guided analytical not safe for too many threads */
266 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267 schedule = kmp_sch_guided_iterative_chunked;
268 KMP_WARNING(DispatchManyThreads);
269 }
270 if (schedule == kmp_sch_runtime_simd) {
271 // compiler provides simd_width in the chunk parameter
272 schedule = team->t.t_sched.r_sched_type;
273 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275 // Detail the schedule if needed (global controls are differentiated
276 // appropriately)
277 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278 schedule == __kmp_static) {
279 schedule = kmp_sch_static_balanced_chunked;
280 } else {
281 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282 schedule = kmp_sch_guided_simd;
283 }
284 chunk = team->t.t_sched.chunk * chunk;
285 }
286#if USE_ITT_BUILD
287 if (cur_chunk)
288 *cur_chunk = chunk;
289#endif
290#ifdef KMP_DEBUG
291 {
292 char *buff;
293 // create format specifiers before the debug output
294 buff = __kmp_str_format(
295 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296 " chunk:%%%s\n",
297 traits_t<ST>::spec);
298 KD_TRACE(10, (buff, gtid, schedule, chunk));
299 __kmp_str_free(&buff);
300 }
301#endif
302 }
303 pr->u.p.parm1 = chunk;
304 }
305 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306 "unknown scheduling type");
307
308 pr->u.p.count = 0;
309
310 if (__kmp_env_consistency_check) {
311 if (st == 0) {
312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314 }
315 }
316 // compute trip count
317 if (st == 1) { // most common case
318 if (ub >= lb) {
319 tc = ub - lb + 1;
320 } else { // ub < lb
321 tc = 0; // zero-trip
322 }
323 } else if (st < 0) {
324 if (lb >= ub) {
325 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326 // where the division needs to be unsigned regardless of the result type
327 tc = (UT)(lb - ub) / (-st) + 1;
328 } else { // lb < ub
329 tc = 0; // zero-trip
330 }
331 } else { // st > 0
332 if (ub >= lb) {
333 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334 // where the division needs to be unsigned regardless of the result type
335 tc = (UT)(ub - lb) / st + 1;
336 } else { // ub < lb
337 tc = 0; // zero-trip
338 }
339 }
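  // Worked example of the trip-count formula (hypothetical bounds): the loop
  // for (i = 2; i <= 14; i += 3) arrives here as lb=2, ub=14, st=3, giving
  // tc = (14 - 2) / 3 + 1 = 5 (i = 2, 5, 8, 11, 14). A downward loop
  // normalized to lb=10, ub=1, st=-2 gives tc = (10 - 1) / 2 + 1 = 5.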
340
341#if KMP_STATS_ENABLED
342 if (KMP_MASTER_GTID(gtid)) {
343 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344 }
345#endif
346
347 pr->u.p.lb = lb;
348 pr->u.p.ub = ub;
349 pr->u.p.st = st;
350 pr->u.p.tc = tc;
351
352#if KMP_OS_WINDOWS
353 pr->u.p.last_upper = ub + st;
354#endif /* KMP_OS_WINDOWS */
355
356 /* NOTE: only the active parallel region(s) have active ordered sections */
357
358 if (active) {
359 if (pr->flags.ordered) {
360 pr->ordered_bumped = 0;
361 pr->u.p.ordered_lower = 1;
362 pr->u.p.ordered_upper = 0;
363 }
364 }
365
366 switch (schedule) {
367#if KMP_STATIC_STEAL_ENABLED
368 case kmp_sch_static_steal: {
369 T ntc, init;
370
371 KD_TRACE(100,
372 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373 gtid));
374
375 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376 if (nproc > 1 && ntc >= nproc) {
377 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378 T id = tid;
379 T small_chunk, extras;
380 kmp_uint32 old = UNUSED;
381 int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382 if (traits_t<T>::type_size > 4) {
383 // AC: TODO: check if 16-byte CAS available and use it to
384 // improve performance (probably wait for explicit request
385 // before spending time on this).
386 // For now use dynamically allocated per-private-buffer lock,
387 // free memory in __kmp_dispatch_next when status==0.
388 pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389 __kmp_init_lock(pr->u.p.steal_lock);
390 }
391 small_chunk = ntc / nproc;
392 extras = ntc % nproc;
393
394 init = id * small_chunk + (id < extras ? id : extras);
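      // Worked example (hypothetical numbers): tc=100, chunk=10 -> ntc=10
      // chunks; with nproc=4, small_chunk=2 and extras=2, so threads 0..3
      // initially own chunk index ranges [0,3), [3,6), [6,8), [8,10)
      // (init is the first owned chunk, ub below is the exclusive end).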
395 pr->u.p.count = init;
396 if (claimed) { // did we succeed in claiming our own buffer?
397 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398 // Other threads will inspect steal_flag when searching for a victim.
399 // READY means other threads may steal from this thread from now on.
400 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401 } else {
402 // another thread has stolen our whole range
403 KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
404 pr->u.p.ub = init; // mark that there are no iterations to work on
405 }
406 pr->u.p.parm2 = ntc; // save number of chunks
407 // parm3 is the number of times to attempt stealing, which is
408 // nproc (just a heuristic; it could be optimized later on).
409 pr->u.p.parm3 = nproc;
410 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411 break;
412 } else {
413 /* too few chunks: switching to kmp_sch_dynamic_chunked */
414 schedule = kmp_sch_dynamic_chunked;
415 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416 "kmp_sch_dynamic_chunked\n",
417 gtid));
418 goto dynamic_init;
419 break;
420 } // if
421 } // case
422#endif
423 case kmp_sch_static_balanced: {
424 T init, limit;
425
426 KD_TRACE(
427 100,
428 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429 gtid));
430
431 if (nproc > 1) {
432 T id = tid;
433
434 if (tc < nproc) {
435 if (id < tc) {
436 init = id;
437 limit = id;
438 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439 } else {
440 pr->u.p.count = 1; /* means no more chunks to execute */
441 pr->u.p.parm1 = FALSE;
442 break;
443 }
444 } else {
445 T small_chunk = tc / nproc;
446 T extras = tc % nproc;
447 init = id * small_chunk + (id < extras ? id : extras);
448 limit = init + small_chunk - (id < extras ? 0 : 1);
449 pr->u.p.parm1 = (id == nproc - 1);
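        // Worked example (hypothetical numbers): tc=10, nproc=4 gives
        // small_chunk=2, extras=2, so threads 0..3 get iterations
        // [0,2], [3,5], [6,7], [8,9]; only thread 3 reports lastprivate.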
450 }
451 } else {
452 if (tc > 0) {
453 init = 0;
454 limit = tc - 1;
455 pr->u.p.parm1 = TRUE;
456 } else {
457 // zero trip count
458 pr->u.p.count = 1; /* means no more chunks to execute */
459 pr->u.p.parm1 = FALSE;
460 break;
461 }
462 }
463#if USE_ITT_BUILD
464 // Calculate chunk for metadata report
465 if (itt_need_metadata_reporting)
466 if (cur_chunk)
467 *cur_chunk = limit - init + 1;
468#endif
469 if (st == 1) {
470 pr->u.p.lb = lb + init;
471 pr->u.p.ub = lb + limit;
472 } else {
473 // calculated upper bound, "ub" is user-defined upper bound
474 T ub_tmp = lb + limit * st;
475 pr->u.p.lb = lb + init * st;
476 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477 // it exactly
478 if (st > 0) {
479 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480 } else {
481 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482 }
483 }
484 if (pr->flags.ordered) {
485 pr->u.p.ordered_lower = init;
486 pr->u.p.ordered_upper = limit;
487 }
488 break;
489 } // case
490 case kmp_sch_static_balanced_chunked: {
491 // similar to balanced, but chunk adjusted to multiple of simd width
492 T nth = nproc;
493 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494 " -> falling-through to static_greedy\n",
495 gtid));
496 schedule = kmp_sch_static_greedy;
497 if (nth > 1)
498 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499 else
500 pr->u.p.parm1 = tc;
501 break;
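    // Worked example (hypothetical numbers): tc=100, nth=8, chunk=4 (the simd
    // width): ceil(100/8) = 13 iterations per thread, rounded up to the next
    // multiple of 4, so parm1 = 16. The mask trick assumes chunk is a power
    // of two, which holds for simd widths.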
502 } // case
503 case kmp_sch_guided_simd:
504 case kmp_sch_guided_iterative_chunked: {
505 KD_TRACE(
506 100,
507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508 " case\n",
509 gtid));
510
511 if (nproc > 1) {
512 if ((2L * chunk + 1) * nproc >= tc) {
513 /* chunk size too large, switch to dynamic */
514 schedule = kmp_sch_dynamic_chunked;
515 goto dynamic_init;
516 } else {
517 // when remaining iters become less than parm2 - switch to dynamic
518 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519 *(double *)&pr->u.p.parm3 =
520 guided_flt_param / (double)nproc; // may occupy parm3 and parm4
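        // Worked example, assuming the default guided parameters implied by
        // the "K=2 by default" comments below (guided_int_param=2,
        // guided_flt_param=0.5): with nproc=4 and chunk=1, parm2 = 2*4*2 = 16,
        // so the loop goes dynamic once fewer than 16 iterations remain, and
        // parm3 = 0.5/4 = 0.125, so each grab takes about remaining/8
        // iterations, i.e. remaining / (2 * nproc).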
521 }
522 } else {
523 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524 "kmp_sch_static_greedy\n",
525 gtid));
526 schedule = kmp_sch_static_greedy;
527 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528 KD_TRACE(
529 100,
530 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531 gtid));
532 pr->u.p.parm1 = tc;
533 } // if
534 } // case
535 break;
536 case kmp_sch_guided_analytical_chunked: {
537 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538 "kmp_sch_guided_analytical_chunked case\n",
539 gtid));
540
541 if (nproc > 1) {
542 if ((2L * chunk + 1) * nproc >= tc) {
543 /* chunk size too large, switch to dynamic */
544 schedule = kmp_sch_dynamic_chunked;
545 goto dynamic_init;
546 } else {
547 /* commonly used term: (2 nproc - 1)/(2 nproc) */
548 DBL x;
549
550#if KMP_USE_X87CONTROL
551 /* Linux* OS already has 64-bit computation by default for long double,
552 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554 instead of the default 53-bit. Even though long double doesn't work
555 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556 expected to impact the correctness of the algorithm, but this has not
557 been mathematically proven. */
558 // save original FPCW and set precision to 64-bit, as
559 // Windows* OS on IA-32 architecture defaults to 53-bit
560 unsigned int oldFpcw = _control87(0, 0);
561 _control87(_PC_64, _MCW_PC); // 0,0x30000
562#endif
563 /* value used for comparison in solver for cross-over point */
564 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
565
566 /* crossover point--chunk indexes equal to or greater than
567 this point switch to dynamic-style scheduling */
568 UT cross;
569
570 /* commonly used term: (2 nproc - 1)/(2 nproc) */
571 x = 1.0 - 0.5 / (double)nproc;
572
573#ifdef KMP_DEBUG
574 { // test natural alignment
575 struct _test_a {
576 char a;
577 union {
578 char b;
579 DBL d;
580 };
581 } t;
582 ptrdiff_t natural_alignment =
583 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
584 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
585 // long)natural_alignment );
586 KMP_DEBUG_ASSERT(
587 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
588 }
589#endif // KMP_DEBUG
590
591 /* save the term in thread private dispatch structure */
592 *(DBL *)&pr->u.p.parm3 = x;
593
594 /* solve for the crossover point to the nearest integer i for which C_i
595 <= chunk */
596 {
597 UT left, right, mid;
598 long double p;
599
600 /* estimate initial upper and lower bound */
601
602 /* doesn't matter what value right is as long as it is positive, but
603 it affects performance of the solver */
604 right = 229;
605 p = __kmp_pow<UT>(x, right);
606 if (p > target) {
607 do {
608 p *= p;
609 right <<= 1;
610 } while (p > target && right < (1 << 27));
611 /* lower bound is previous (failed) estimate of upper bound */
612 left = right >> 1;
613 } else {
614 left = 0;
615 }
616
617 /* bisection root-finding method */
618 while (left + 1 < right) {
619 mid = (left + right) / 2;
620 if (__kmp_pow<UT>(x, mid) > target) {
621 left = mid;
622 } else {
623 right = mid;
624 }
625 } // while
626 cross = right;
627 }
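          // In closed form (what the bisection computes without libm): cross
          // is the smallest integer c with x^c <= target, roughly
          // ceil(log(target) / log(x)). For example, with hypothetical
          // nproc=4, chunk=1, tc=1000: x = 0.875, target = 0.012, so
          // cross = 34.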
628 /* assert sanity of computed crossover point */
629 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
630 __kmp_pow<UT>(x, cross) <= target);
631
632 /* save the crossover point in thread private dispatch structure */
633 pr->u.p.parm2 = cross;
634
635// C75803
636#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
637#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
638#else
639#define GUIDED_ANALYTICAL_WORKAROUND (x)
640#endif
641 /* dynamic-style scheduling offset */
642 pr->u.p.count = tc -
643 __kmp_dispatch_guided_remaining(
644 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
645 cross * chunk;
646#if KMP_USE_X87CONTROL
647 // restore FPCW
648 _control87(oldFpcw, _MCW_PC);
649#endif
650 } // if
651 } else {
652 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
653 "kmp_sch_static_greedy\n",
654 gtid));
655 schedule = kmp_sch_static_greedy;
656 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
657 pr->u.p.parm1 = tc;
658 } // if
659 } // case
660 break;
661 case kmp_sch_static_greedy:
662 KD_TRACE(
663 100,
664 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
665 gtid));
666 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
667 break;
668 case kmp_sch_static_chunked:
669 case kmp_sch_dynamic_chunked:
670 dynamic_init:
671 if (pr->u.p.parm1 <= 0)
672 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
673 else if (pr->u.p.parm1 > tc)
674 pr->u.p.parm1 = tc;
675 // Store the total number of chunks to prevent integer overflow during
676 // bounds calculations in the get next chunk routine.
677 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
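    // e.g. (hypothetical numbers) tc=10, parm1=3 -> parm2 = 3 + 1 = 4 chunks.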
678 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
679 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
680 gtid));
681 break;
682 case kmp_sch_trapezoidal: {
683 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
684
685 T parm1, parm2, parm3, parm4;
686 KD_TRACE(100,
687 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
688 gtid));
689
690 parm1 = chunk;
691
692 /* F : size of the first cycle */
693 parm2 = (tc / (2 * nproc));
694
695 if (parm2 < 1) {
696 parm2 = 1;
697 }
698
699 /* L : size of the last cycle. Make sure the last cycle is not larger
700 than the first cycle. */
701 if (parm1 < 1) {
702 parm1 = 1;
703 } else if (parm1 > parm2) {
704 parm1 = parm2;
705 }
706
707 /* N : number of cycles */
708 parm3 = (parm2 + parm1);
709 parm3 = (2 * tc + parm3 - 1) / parm3;
710
711 if (parm3 < 2) {
712 parm3 = 2;
713 }
714
715 /* sigma : decreasing increment of the trapezoid */
716 parm4 = (parm3 - 1);
717 parm4 = (parm2 - parm1) / parm4;
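    // Worked example (hypothetical numbers): tc=1000, nproc=4, chunk=10:
    // parm2 = 1000/8 = 125 (first chunk), parm1 = 10 (minimum chunk),
    // parm3 = (2000 + 134) / 135 = 15 cycles, parm4 = (125 - 10) / 14 = 8,
    // so successive chunks are roughly 125, 117, 109, ... shrinking by 8.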
718
719 // pointless check, because parm4 >= 0 always
720 // if ( parm4 < 0 ) {
721 // parm4 = 0;
722 //}
723
724 pr->u.p.parm1 = parm1;
725 pr->u.p.parm2 = parm2;
726 pr->u.p.parm3 = parm3;
727 pr->u.p.parm4 = parm4;
728 } // case
729 break;
730
731 default: {
732 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
733 KMP_HNT(GetNewerLibrary), // Hint
734 __kmp_msg_null // Variadic argument list terminator
735 );
736 } break;
737 } // switch
738 pr->schedule = schedule;
739}
740
741#if KMP_USE_HIER_SCHED
742template <typename T>
743inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
744 typename traits_t<T>::signed_t st);
745template <>
746inline void
747__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
748 kmp_int32 ub, kmp_int32 st) {
749 __kmp_dispatch_init_hierarchy<kmp_int32>(
750 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
751 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
752}
753template <>
754inline void
755__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
756 kmp_uint32 ub, kmp_int32 st) {
757 __kmp_dispatch_init_hierarchy<kmp_uint32>(
758 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
759 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
760}
761template <>
762inline void
763__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
764 kmp_int64 ub, kmp_int64 st) {
765 __kmp_dispatch_init_hierarchy<kmp_int64>(
766 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
767 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
768}
769template <>
770inline void
771__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
772 kmp_uint64 ub, kmp_int64 st) {
773 __kmp_dispatch_init_hierarchy<kmp_uint64>(
774 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
775 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
776}
777
778// free all the hierarchy scheduling memory associated with the team
779void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
780 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
781 for (int i = 0; i < num_disp_buff; ++i) {
782 // type does not matter here so use kmp_int32
783 auto sh =
784 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
785 &team->t.t_disp_buffer[i]);
786 if (sh->hier) {
787 sh->hier->deallocate();
788 __kmp_free(sh->hier);
789 }
790 }
791}
792#endif
793
794// UT - unsigned flavor of T, ST - signed flavor of T,
795// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
796template <typename T>
797static void
798__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
799 T ub, typename traits_t<T>::signed_t st,
800 typename traits_t<T>::signed_t chunk, int push_ws) {
801 typedef typename traits_t<T>::unsigned_t UT;
802
803 int active;
804 kmp_info_t *th;
805 kmp_team_t *team;
806 kmp_uint32 my_buffer_index;
807 dispatch_private_info_template<T> *pr;
808 dispatch_shared_info_template<T> volatile *sh;
809
810 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
811 sizeof(dispatch_private_info));
812 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
813 sizeof(dispatch_shared_info));
814 __kmp_assert_valid_gtid(gtid);
815
816 if (!TCR_4(__kmp_init_parallel))
817 __kmp_parallel_initialize();
818
819 __kmp_resume_if_soft_paused();
820
821#if INCLUDE_SSC_MARKS
822 SSC_MARK_DISPATCH_INIT();
823#endif
824#ifdef KMP_DEBUG
825 typedef typename traits_t<T>::signed_t ST;
826 {
827 char *buff;
828 // create format specifiers before the debug output
829 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
830 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
831 traits_t<ST>::spec, traits_t<T>::spec,
832 traits_t<T>::spec, traits_t<ST>::spec);
833 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
834 __kmp_str_free(&buff);
835 }
836#endif
837 /* setup data */
838 th = __kmp_threads[gtid];
839 team = th->th.th_team;
840 active = !team->t.t_serialized;
841 th->th.th_ident = loc;
842
843 // Any half-decent optimizer will remove this test when the blocks are empty
844 // since the macros expand to nothing
845 // when statistics are disabled.
846 if (schedule == __kmp_static) {
847 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
848 } else {
849 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
850 }
851
852#if KMP_USE_HIER_SCHED
853 // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
854 // environment variable. Hierarchical scheduling does not work with ordered,
855 // so if ordered is detected, revert to threaded scheduling.
856 bool ordered;
857 enum sched_type my_sched = schedule;
858 my_buffer_index = th->th.th_dispatch->th_disp_index;
859 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
860 &th->th.th_dispatch
861 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
862 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
863 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
864 my_sched =
865 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
866 ordered = (kmp_ord_lower & my_sched);
867 if (pr->flags.use_hier) {
868 if (ordered) {
869 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
870 "Disabling hierarchical scheduling.\n",
871 gtid));
872 pr->flags.use_hier = FALSE;
873 }
874 }
875 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
876 // Don't use hierarchical for ordered parallel loops and don't
877 // use the runtime hierarchy if one was specified in the program
878 if (!ordered && !pr->flags.use_hier)
879 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
880 }
881#endif // KMP_USE_HIER_SCHED
882
883#if USE_ITT_BUILD
884 kmp_uint64 cur_chunk = chunk;
885 int itt_need_metadata_reporting =
886 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
887 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
888 team->t.t_active_level == 1;
889#endif
890 if (!active) {
891 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
892 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
893 } else {
894 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
895 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
896
897 my_buffer_index = th->th.th_dispatch->th_disp_index++;
898
899 /* What happens when the number of threads changes? Do we need to resize the buffer? */
900 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
901 &th->th.th_dispatch
902 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
903 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
904 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
905 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
906 my_buffer_index));
907 if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
908 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
909 " sh->buffer_index:%d\n",
910 gtid, my_buffer_index, sh->buffer_index));
911 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
912 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
913 // Note: KMP_WAIT() cannot be used here: buffer_index and
914 // my_buffer_index are *always* 32-bit integers.
915 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
916 "sh->buffer_index:%d\n",
917 gtid, my_buffer_index, sh->buffer_index));
918 }
919 }
920
921 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
922#if USE_ITT_BUILD
923 &cur_chunk,
924#endif
925 chunk, (T)th->th.th_team_nproc,
926 (T)th->th.th_info.ds.ds_tid);
927 if (active) {
928 if (pr->flags.ordered == 0) {
929 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
930 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
931 } else {
932 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
933 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
934 }
935 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
936 th->th.th_dispatch->th_dispatch_sh_current =
937 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
938#if USE_ITT_BUILD
939 if (pr->flags.ordered) {
940 __kmp_itt_ordered_init(gtid);
941 }
942 // Report loop metadata
943 if (itt_need_metadata_reporting) {
944 // Only report metadata by primary thread of active team at level 1
945 kmp_uint64 schedtype = 0;
946 switch (schedule) {
947 case kmp_sch_static_chunked:
948 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
949 break;
950 case kmp_sch_static_greedy:
951 cur_chunk = pr->u.p.parm1;
952 break;
953 case kmp_sch_dynamic_chunked:
954 schedtype = 1;
955 break;
956 case kmp_sch_guided_iterative_chunked:
957 case kmp_sch_guided_analytical_chunked:
958 case kmp_sch_guided_simd:
959 schedtype = 2;
960 break;
961 default:
962 // Should we put this case under "static"?
963 // case kmp_sch_static_steal:
964 schedtype = 3;
965 break;
966 }
967 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
968 }
969#if KMP_USE_HIER_SCHED
970 if (pr->flags.use_hier) {
971 pr->u.p.count = 0;
972 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
973 }
974#endif // KMP_USE_HIER_SCHED
975#endif /* USE_ITT_BUILD */
976 }
977
978#ifdef KMP_DEBUG
979 {
980 char *buff;
981 // create format specifiers before the debug output
982 buff = __kmp_str_format(
983 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
984 "lb:%%%s ub:%%%s"
985 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
986 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
987 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
988 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
989 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
990 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
991 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
992 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
993 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
994 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
995 __kmp_str_free(&buff);
996 }
997#endif
998#if OMPT_SUPPORT && OMPT_OPTIONAL
999 if (ompt_enabled.ompt_callback_work) {
1000 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1001 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1002 ompt_callbacks.ompt_callback(ompt_callback_work)(
1003 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1004 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1005 }
1006#endif
1007 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1008}
1009
1010/* For ordered loops, either __kmp_dispatch_finish() should be called after
1011 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1012 * every chunk of iterations. If the ordered section(s) were not executed
1013 * for this iteration (or every iteration in this chunk), we need to set the
1014 * ordered iteration counters so that the next thread can proceed. */
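/* Rough sketch of the call pattern compiler-generated code uses around these
   routines (illustrative only; the actual entry points are the __kmpc_*
   wrappers that forward to these templates):

     __kmpc_dispatch_init_4(loc, gtid, sched, lb, ub, st, chunk);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lo, &hi, &st)) {
       for (i = lo; i <= hi; i += st) {   // assumes st > 0 for brevity
         ... loop body, possibly containing an ordered region ...
         __kmpc_dispatch_fini_4(loc, gtid); // ordered loops only
       }
     }
*/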
1015template <typename UT>
1016static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1017 typedef typename traits_t<UT>::signed_t ST;
1018 __kmp_assert_valid_gtid(gtid);
1019 kmp_info_t *th = __kmp_threads[gtid];
1020
1021 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1022 if (!th->th.th_team->t.t_serialized) {
1023
1024 dispatch_private_info_template<UT> *pr =
1025 reinterpret_cast<dispatch_private_info_template<UT> *>(
1026 th->th.th_dispatch->th_dispatch_pr_current);
1027 dispatch_shared_info_template<UT> volatile *sh =
1028 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1029 th->th.th_dispatch->th_dispatch_sh_current);
1030 KMP_DEBUG_ASSERT(pr);
1031 KMP_DEBUG_ASSERT(sh);
1032 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1033 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1034
1035 if (pr->ordered_bumped) {
1036 KD_TRACE(
1037 1000,
1038 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1039 gtid));
1040 pr->ordered_bumped = 0;
1041 } else {
1042 UT lower = pr->u.p.ordered_lower;
1043
1044#ifdef KMP_DEBUG
1045 {
1046 char *buff;
1047 // create format specifiers before the debug output
1048 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1049 "ordered_iteration:%%%s lower:%%%s\n",
1050 traits_t<UT>::spec, traits_t<UT>::spec);
1051 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1052 __kmp_str_free(&buff);
1053 }
1054#endif
1055
1056 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1057 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1058 KMP_MB(); /* is this necessary? */
1059#ifdef KMP_DEBUG
1060 {
1061 char *buff;
1062 // create format specifiers before the debug output
1063 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1064 "ordered_iteration:%%%s lower:%%%s\n",
1065 traits_t<UT>::spec, traits_t<UT>::spec);
1066 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1067 __kmp_str_free(&buff);
1068 }
1069#endif
1070
1071 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1072 } // if
1073 } // if
1074 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1075}
1076
1077#ifdef KMP_GOMP_COMPAT
1078
1079template <typename UT>
1080static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1081 typedef typename traits_t<UT>::signed_t ST;
1082 __kmp_assert_valid_gtid(gtid);
1083 kmp_info_t *th = __kmp_threads[gtid];
1084
1085 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1086 if (!th->th.th_team->t.t_serialized) {
1087 dispatch_private_info_template<UT> *pr =
1088 reinterpret_cast<dispatch_private_info_template<UT> *>(
1089 th->th.th_dispatch->th_dispatch_pr_current);
1090 dispatch_shared_info_template<UT> volatile *sh =
1091 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1092 th->th.th_dispatch->th_dispatch_sh_current);
1093 KMP_DEBUG_ASSERT(pr);
1094 KMP_DEBUG_ASSERT(sh);
1095 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1096 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1097
1098 UT lower = pr->u.p.ordered_lower;
1099 UT upper = pr->u.p.ordered_upper;
1100 UT inc = upper - lower + 1;
1101
1102 if (pr->ordered_bumped == inc) {
1103 KD_TRACE(
1104 1000,
1105 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1106 gtid));
1107 pr->ordered_bumped = 0;
1108 } else {
1109 inc -= pr->ordered_bumped;
1110
1111#ifdef KMP_DEBUG
1112 {
1113 char *buff;
1114 // create format specifiers before the debug output
1115 buff = __kmp_str_format(
1116 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1117 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1118 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1119 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1120 __kmp_str_free(&buff);
1121 }
1122#endif
1123
1124 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1125 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1126
1127 KMP_MB(); /* is this necessary? */
1128 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1129 "ordered_bumped to zero\n",
1130 gtid));
1131 pr->ordered_bumped = 0;
1133#ifdef KMP_DEBUG
1134 {
1135 char *buff;
1136 // create format specifiers before the debug output
1137 buff = __kmp_str_format(
1138 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1139 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1140 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1141 traits_t<UT>::spec);
1142 KD_TRACE(1000,
1143 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1144 __kmp_str_free(&buff);
1145 }
1146#endif
1147
1148 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1149 }
1150 // }
1151 }
1152 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1153}
1154
1155#endif /* KMP_GOMP_COMPAT */
1156
1157template <typename T>
1158int __kmp_dispatch_next_algorithm(int gtid,
1159 dispatch_private_info_template<T> *pr,
1160 dispatch_shared_info_template<T> volatile *sh,
1161 kmp_int32 *p_last, T *p_lb, T *p_ub,
1162 typename traits_t<T>::signed_t *p_st, T nproc,
1163 T tid) {
1164 typedef typename traits_t<T>::unsigned_t UT;
1165 typedef typename traits_t<T>::signed_t ST;
1166 typedef typename traits_t<T>::floating_t DBL;
1167 int status = 0;
1168 bool last = false;
1169 T start;
1170 ST incr;
1171 UT limit, trip, init;
1172 kmp_info_t *th = __kmp_threads[gtid];
1173 kmp_team_t *team = th->th.th_team;
1174
1175 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1176 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1177 KMP_DEBUG_ASSERT(pr);
1178 KMP_DEBUG_ASSERT(sh);
1179 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1180#ifdef KMP_DEBUG
1181 {
1182 char *buff;
1183 // create format specifiers before the debug output
1184 buff =
1185 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1186 "sh:%%p nproc:%%%s tid:%%%s\n",
1187 traits_t<T>::spec, traits_t<T>::spec);
1188 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1189 __kmp_str_free(&buff);
1190 }
1191#endif
1192
1193 // zero trip count
1194 if (pr->u.p.tc == 0) {
1195 KD_TRACE(10,
1196 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1197 "zero status:%d\n",
1198 gtid, status));
1199 return 0;
1200 }
1201
1202 switch (pr->schedule) {
1203#if KMP_STATIC_STEAL_ENABLED
1204 case kmp_sch_static_steal: {
1205 T chunk = pr->u.p.parm1;
1206 UT nchunks = pr->u.p.parm2;
1207 KD_TRACE(100,
1208 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1209 gtid));
1210
1211 trip = pr->u.p.tc - 1;
1212
1213 if (traits_t<T>::type_size > 4) {
1214 // use lock for 8-byte induction variable.
1215 // TODO (optional): check presence and use 16-byte CAS
1216 kmp_lock_t *lck = pr->u.p.steal_lock;
1217 KMP_DEBUG_ASSERT(lck != NULL);
1218 if (pr->u.p.count < (UT)pr->u.p.ub) {
1219 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1220 __kmp_acquire_lock(lck, gtid);
1221 // try to get own chunk of iterations
1222 init = (pr->u.p.count)++;
1223 status = (init < (UT)pr->u.p.ub);
1224 __kmp_release_lock(lck, gtid);
1225 } else {
1226 status = 0; // no own chunks
1227 }
1228 if (!status) { // try to steal
1229 kmp_lock_t *lckv; // victim buffer's lock
1230 T while_limit = pr->u.p.parm3;
1231 T while_index = 0;
1232 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1233 __kmp_dispatch_num_buffers; // current loop index
1234 // note: victim thread can potentially execute another loop
1235 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1236 while ((!status) && (while_limit != ++while_index)) {
1237 dispatch_private_info_template<T> *v;
1238 T remaining;
1239 T victimId = pr->u.p.parm4;
1240 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1241 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1242 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1243 KMP_DEBUG_ASSERT(v);
1244 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1245 oldVictimId != victimId) {
1246 victimId = (victimId + 1) % nproc;
1247 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1248 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1249 KMP_DEBUG_ASSERT(v);
1250 }
1251 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1252 continue; // try once more (nproc attempts in total)
1253 }
1254 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1255 kmp_uint32 old = UNUSED;
1256 // try to steal whole range from inactive victim
1257 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1258 if (status) {
1259 // initialize self buffer with victim's whole range of chunks
1260 T id = victimId;
1261 T small_chunk, extras;
1262 small_chunk = nchunks / nproc; // chunks per thread
1263 extras = nchunks % nproc;
1264 init = id * small_chunk + (id < extras ? id : extras);
1265 __kmp_acquire_lock(lck, gtid);
1266 pr->u.p.count = init + 1; // exclude one we execute immediately
1267 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1268 __kmp_release_lock(lck, gtid);
1269 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1270 // no need to reinitialize other thread invariants: lb, st, etc.
1271#ifdef KMP_DEBUG
1272 {
1273 char *buff;
1274 // create format specifiers before the debug output
1275 buff = __kmp_str_format(
1276 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1277 "count:%%%s ub:%%%s\n",
1278 traits_t<UT>::spec, traits_t<T>::spec);
1279 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1280 __kmp_str_free(&buff);
1281 }
1282#endif
1283 // activate non-empty buffer and let others steal from us
1284 if (pr->u.p.count < (UT)pr->u.p.ub)
1285 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1286 break;
1287 }
1288 }
1289 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1290 v->u.p.count >= (UT)v->u.p.ub) {
1291 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1292 continue; // no chunks to steal, try next victim
1293 }
1294 lckv = v->u.p.steal_lock;
1295 KMP_ASSERT(lckv != NULL);
1296 __kmp_acquire_lock(lckv, gtid);
1297 limit = v->u.p.ub; // keep initial ub
1298 if (v->u.p.count >= limit) {
1299 __kmp_release_lock(lckv, gtid);
1300 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1301 continue; // no chunks to steal, try next victim
1302 }
1303
1304 // stealing succeeded, reduce victim's ub by 1/4 of the remaining chunks
1305 // TODO: is this heuristic good enough?
1306 remaining = limit - v->u.p.count;
1307 if (remaining > 7) {
1308 // steal 1/4 of remaining
1309 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1310 init = (v->u.p.ub -= (remaining >> 2));
1311 } else {
1312 // steal 1 chunk of 1..7 remaining
1313 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1314 init = (v->u.p.ub -= 1);
1315 }
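          // e.g. (hypothetical numbers) victim has count=10, ub=30:
          // remaining = 20, so the thief takes 5 chunks by dropping the
          // victim's ub to 25; init = 25 is the chunk the thief runs first,
          // and chunks 26..29 become stealable from the thief below.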
1316 __kmp_release_lock(lckv, gtid);
1317#ifdef KMP_DEBUG
1318 {
1319 char *buff;
1320 // create format specifiers before the debug output
1321 buff = __kmp_str_format(
1322 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1323 "count:%%%s ub:%%%s\n",
1324 traits_t<UT>::spec, traits_t<UT>::spec);
1325 KD_TRACE(10, (buff, gtid, victimId, init, limit));
1326 __kmp_str_free(&buff);
1327 }
1328#endif
1329 KMP_DEBUG_ASSERT(init + 1 <= limit);
1330 pr->u.p.parm4 = victimId; // remember victim to steal from
1331 status = 1;
1332 // now update own count and ub with stolen range excluding init chunk
1333 __kmp_acquire_lock(lck, gtid);
1334 pr->u.p.count = init + 1;
1335 pr->u.p.ub = limit;
1336 __kmp_release_lock(lck, gtid);
1337 // activate non-empty buffer and let others steal from us
1338 if (init + 1 < limit)
1339 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1340 } // while (search for victim)
1341 } // if (try to find victim and steal)
1342 } else {
1343 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1344 // as all operations on pair (count, ub) must be done atomically
1345 typedef union {
1346 struct {
1347 UT count;
1348 T ub;
1349 } p;
1350 kmp_int64 b;
1351 } union_i4;
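    // Packing (count, ub) into one 64-bit word lets a single CAS update both
    // fields together: the owner bumps count at the head while thieves lower
    // ub at the tail, and a torn update could otherwise let count run past ub.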
1352 union_i4 vold, vnew;
1353 if (pr->u.p.count < (UT)pr->u.p.ub) {
1354 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1355 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1356 vnew.b = vold.b;
1357 vnew.p.count++; // get chunk from head of self range
1358 while (!KMP_COMPARE_AND_STORE_REL64(
1359 (volatile kmp_int64 *)&pr->u.p.count,
1360 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1361 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1362 KMP_CPU_PAUSE();
1363 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1364 vnew.b = vold.b;
1365 vnew.p.count++;
1366 }
1367 init = vold.p.count;
1368 status = (init < (UT)vold.p.ub);
1369 } else {
1370 status = 0; // no own chunks
1371 }
1372 if (!status) { // try to steal
1373 T while_limit = pr->u.p.parm3;
1374 T while_index = 0;
1375 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1376 __kmp_dispatch_num_buffers; // current loop index
1377 // note: victim thread can potentially execute another loop
1378 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1379 while ((!status) && (while_limit != ++while_index)) {
1380 dispatch_private_info_template<T> *v;
1381 T remaining;
1382 T victimId = pr->u.p.parm4;
1383 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1384 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1385 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1386 KMP_DEBUG_ASSERT(v);
1387 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1388 oldVictimId != victimId) {
1389 victimId = (victimId + 1) % nproc;
1390 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1391 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1392 KMP_DEBUG_ASSERT(v);
1393 }
1394 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1395 continue; // try once more (nproc attempts in total)
1396 }
1397 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1398 kmp_uint32 old = UNUSED;
1399 // try to steal whole range from inactive victim
1400 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1401 if (status) {
1402 // initialize self buffer with victim's whole range of chunks
1403 T id = victimId;
1404 T small_chunk, extras;
1405 small_chunk = nchunks / nproc; // chunks per thread
1406 extras = nchunks % nproc;
1407 init = id * small_chunk + (id < extras ? id : extras);
1408 vnew.p.count = init + 1;
1409 vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1410 // write pair (count, ub) at once atomically
1411#if KMP_ARCH_X86
1412 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1413#else
1414 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1415#endif
1416 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1417 // no need to initialize other thread invariants: lb, st, etc.
1418#ifdef KMP_DEBUG
1419 {
1420 char *buff;
1421 // create format specifiers before the debug output
1422 buff = __kmp_str_format(
1423 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1424 "count:%%%s ub:%%%s\n",
1425 traits_t<UT>::spec, traits_t<T>::spec);
1426 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1427 __kmp_str_free(&buff);
1428 }
1429#endif
1430 // activate non-empty buffer and let others steal from us
1431 if (pr->u.p.count < (UT)pr->u.p.ub)
1432 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1433 break;
1434 }
1435 }
1436 while (1) { // CAS loop with check if victim still has enough chunks
1437 // many threads may be stealing concurrently from same victim
1438 vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1439 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1440 vold.p.count >= (UT)vold.p.ub) {
1441 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1442 break; // no chunks to steal, try next victim
1443 }
1444 vnew.b = vold.b;
1445 remaining = vold.p.ub - vold.p.count;
1446 // try to steal 1/4 of remaining
1447 // TODO: is this heuristic good enough?
1448 if (remaining > 7) {
1449 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1450 } else {
1451 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1452 }
1453 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1454 if (KMP_COMPARE_AND_STORE_REL64(
1455 (volatile kmp_int64 *)&v->u.p.count,
1456 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1457 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1458 // stealing succeeded
1459#ifdef KMP_DEBUG
1460 {
1461 char *buff;
1462 // create format specifiers before the debug output
1463 buff = __kmp_str_format(
1464 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1465 "count:%%%s ub:%%%s\n",
1466 traits_t<T>::spec, traits_t<T>::spec);
1467 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1468 __kmp_str_free(&buff);
1469 }
1470#endif
1471 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1472 vold.p.ub - vnew.p.ub);
1473 status = 1;
1474 pr->u.p.parm4 = victimId; // keep victim id
1475 // now update own count and ub
1476 init = vnew.p.ub;
1477 vold.p.count = init + 1;
1478#if KMP_ARCH_X86
1479 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1480#else
1481 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1482#endif
1483 // activate non-empty buffer and let others steal from us
1484 if (vold.p.count < (UT)vold.p.ub)
1485 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1486 break;
1487 } // if (check CAS result)
1488 KMP_CPU_PAUSE(); // CAS failed, retry
1489 } // while (try to steal from particular victim)
1490 } // while (search for victim)
1491 } // if (try to find victim and steal)
1492 } // if (4-byte induction variable)
1493 if (!status) {
1494 *p_lb = 0;
1495 *p_ub = 0;
1496 if (p_st != NULL)
1497 *p_st = 0;
1498 } else {
1499 start = pr->u.p.lb;
1500 init *= chunk;
1501 limit = chunk + init - 1;
1502 incr = pr->u.p.st;
1503 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1504
1505 KMP_DEBUG_ASSERT(init <= trip);
1506 // keep track of done chunks for possible early exit from stealing
1507 // TODO: count executed chunks locally with rare update of shared location
1508 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1509 if ((last = (limit >= trip)) != 0)
1510 limit = trip;
1511 if (p_st != NULL)
1512 *p_st = incr;
1513
1514 if (incr == 1) {
1515 *p_lb = start + init;
1516 *p_ub = start + limit;
1517 } else {
1518 *p_lb = start + init * incr;
1519 *p_ub = start + limit * incr;
1520 }
1521 } // if
1522 break;
1523 } // case
1524#endif // KMP_STATIC_STEAL_ENABLED
1525 case kmp_sch_static_balanced: {
1526 KD_TRACE(
1527 10,
1528 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1529 gtid));
1530 /* check if thread has any iteration to do */
1531 if ((status = !pr->u.p.count) != 0) {
1532 pr->u.p.count = 1;
1533 *p_lb = pr->u.p.lb;
1534 *p_ub = pr->u.p.ub;
1535 last = (pr->u.p.parm1 != 0);
1536 if (p_st != NULL)
1537 *p_st = pr->u.p.st;
1538 } else { /* no iterations to do */
1539 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1540 }
1541 } // case
1542 break;
1543 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1544 merged here */
1545 case kmp_sch_static_chunked: {
1546 T parm1;
1547
1548 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1549 "kmp_sch_static_[affinity|chunked] case\n",
1550 gtid));
1551 parm1 = pr->u.p.parm1;
1552
1553 trip = pr->u.p.tc - 1;
1554 init = parm1 * (pr->u.p.count + tid);
1555
1556 if ((status = (init <= trip)) != 0) {
1557 start = pr->u.p.lb;
1558 incr = pr->u.p.st;
1559 limit = parm1 + init - 1;
1560
1561 if ((last = (limit >= trip)) != 0)
1562 limit = trip;
1563
1564 if (p_st != NULL)
1565 *p_st = incr;
1566
1567 pr->u.p.count += nproc;
1568
1569 if (incr == 1) {
1570 *p_lb = start + init;
1571 *p_ub = start + limit;
1572 } else {
1573 *p_lb = start + init * incr;
1574 *p_ub = start + limit * incr;
1575 }
1576
1577 if (pr->flags.ordered) {
1578 pr->u.p.ordered_lower = init;
1579 pr->u.p.ordered_upper = limit;
1580 } // if
1581 } // if
1582 } // case
1583 break;
1584
1585 case kmp_sch_dynamic_chunked: {
1586 UT chunk_number;
1587 UT chunk_size = pr->u.p.parm1;
1588 UT nchunks = pr->u.p.parm2;
1589
1590 KD_TRACE(
1591 100,
1592 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1593 gtid));
1594
1595 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1596 status = (chunk_number < nchunks);
1597 if (!status) {
1598 *p_lb = 0;
1599 *p_ub = 0;
1600 if (p_st != NULL)
1601 *p_st = 0;
1602 } else {
1603 init = chunk_size * chunk_number;
1604 trip = pr->u.p.tc - 1;
1605 start = pr->u.p.lb;
1606 incr = pr->u.p.st;
1607
1608 if ((last = (trip - init < (UT)chunk_size)))
1609 limit = trip;
1610 else
1611 limit = chunk_size + init - 1;
1612
1613 if (p_st != NULL)
1614 *p_st = incr;
1615
1616 if (incr == 1) {
1617 *p_lb = start + init;
1618 *p_ub = start + limit;
1619 } else {
1620 *p_lb = start + init * incr;
1621 *p_ub = start + limit * incr;
1622 }
1623
1624 if (pr->flags.ordered) {
1625 pr->u.p.ordered_lower = init;
1626 pr->u.p.ordered_upper = limit;
1627 } // if
1628 } // if
1629 } // case
1630 break;
1631
1632 case kmp_sch_guided_iterative_chunked: {
1633 T chunkspec = pr->u.p.parm1;
1634 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1635 "iterative case\n",
1636 gtid));
1637 trip = pr->u.p.tc;
1638 // Start atomic part of calculations
1639 while (1) {
1640 ST remaining; // signed, because can be < 0
1641 init = sh->u.s.iteration; // shared value
1642 remaining = trip - init;
1643 if (remaining <= 0) { // AC: need to compare with 0 first
1644 // nothing to do, don't try atomic op
1645 status = 0;
1646 break;
1647 }
1648 if ((T)remaining <
1649 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1650 // use dynamic-style schedule
1651 // atomically increment iterations, get old value
1652 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1653 (ST)chunkspec);
1654 remaining = trip - init;
1655 if (remaining <= 0) {
1656 status = 0; // all iterations got by other threads
1657 } else {
1658 // got some iterations to work on
1659 status = 1;
1660 if ((T)remaining > chunkspec) {
1661 limit = init + chunkspec - 1;
1662 } else {
1663 last = true; // the last chunk
1664 limit = init + remaining - 1;
1665 } // if
1666 } // if
1667 break;
1668 } // if
1669 limit = init + (UT)((double)remaining *
1670 *(double *)&pr->u.p.parm3); // divide by K*nproc
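      // e.g. (hypothetical numbers) nproc=4, so per the "divide by K*nproc"
      // comment with K=2, parm3 = 0.125: with remaining = 800 this thread
      // tries to claim iterations [init, init + 100) via the CAS below.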
1671 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1672 (ST)init, (ST)limit)) {
1673 // CAS was successful, chunk obtained
1674 status = 1;
1675 --limit;
1676 break;
1677 } // if
1678 } // while
1679 if (status != 0) {
1680 start = pr->u.p.lb;
1681 incr = pr->u.p.st;
1682 if (p_st != NULL)
1683 *p_st = incr;
1684 *p_lb = start + init * incr;
1685 *p_ub = start + limit * incr;
1686 if (pr->flags.ordered) {
1687 pr->u.p.ordered_lower = init;
1688 pr->u.p.ordered_upper = limit;
1689 } // if
1690 } else {
1691 *p_lb = 0;
1692 *p_ub = 0;
1693 if (p_st != NULL)
1694 *p_st = 0;
1695 } // if
1696 } // case
1697 break;
1698
1699 case kmp_sch_guided_simd: {
1700 // same as iterative but curr-chunk adjusted to be multiple of given
1701 // chunk
1702 T chunk = pr->u.p.parm1;
1703 KD_TRACE(100,
1704 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1705 gtid));
1706 trip = pr->u.p.tc;
1707 // Start atomic part of calculations
1708 while (1) {
1709 ST remaining; // signed, because can be < 0
1710 init = sh->u.s.iteration; // shared value
1711 remaining = trip - init;
1712 if (remaining <= 0) { // AC: need to compare with 0 first
1713 status = 0; // nothing to do, don't try atomic op
1714 break;
1715 }
1716 KMP_DEBUG_ASSERT(init % chunk == 0);
1717 // compare with K*nproc*(chunk+1), K=2 by default
1718 if ((T)remaining < pr->u.p.parm2) {
1719 // use dynamic-style schedule
1720 // atomically increment iterations, get old value
1721 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1722 (ST)chunk);
1723 remaining = trip - init;
1724 if (remaining <= 0) {
1725 status = 0; // all iterations already taken by other threads
1726 } else {
1727 // got some iterations to work on
1728 status = 1;
1729 if ((T)remaining > chunk) {
1730 limit = init + chunk - 1;
1731 } else {
1732 last = true; // the last chunk
1733 limit = init + remaining - 1;
1734 } // if
1735 } // if
1736 break;
1737 } // if
1738 // divide by K*nproc
1739 UT span;
1740 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1741 &span);
1742 UT rem = span % chunk;
1743 if (rem) // adjust so that span%chunk == 0
1744 span += chunk - rem;
1745 limit = init + span;
1746 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1747 (ST)init, (ST)limit)) {
1748 // CAS was successful, chunk obtained
1749 status = 1;
1750 --limit;
1751 break;
1752 } // if
1753 } // while
1754 if (status != 0) {
1755 start = pr->u.p.lb;
1756 incr = pr->u.p.st;
1757 if (p_st != NULL)
1758 *p_st = incr;
1759 *p_lb = start + init * incr;
1760 *p_ub = start + limit * incr;
1761 if (pr->flags.ordered) {
1762 pr->u.p.ordered_lower = init;
1763 pr->u.p.ordered_upper = limit;
1764 } // if
1765 } else {
1766 *p_lb = 0;
1767 *p_ub = 0;
1768 if (p_st != NULL)
1769 *p_st = 0;
1770 } // if
1771 } // case
1772 break;
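// kmp_sch_guided_simd differs from the iterative case only in that the
// proportional span is rounded up to a multiple of the chunk size, so every
// claimed range starts and ends on a SIMD-chunk boundary. The rounding step
// is isolated below as a small illustrative helper (the name is made up; the
// runtime performs this inline).
#if 0
#include <cstdint>

static uint64_t round_span_to_chunk(uint64_t span, uint64_t chunk) {
  uint64_t rem = span % chunk;
  if (rem) // adjust so that span % chunk == 0
    span += chunk - rem;
  return span;
}
// e.g. round_span_to_chunk(37, 8) == 40, so the CAS above would claim five
// full chunks instead of four and a fraction.
#endif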
1773
1774 case kmp_sch_guided_analytical_chunked: {
1775 T chunkspec = pr->u.p.parm1;
1776 UT chunkIdx;
1777#if KMP_USE_X87CONTROL
1778 /* for storing original FPCW value for Windows* OS on
1779 IA-32 architecture 8-byte version */
1780 unsigned int oldFpcw;
1781 unsigned int fpcwSet = 0;
1782#endif
1783 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1784 "kmp_sch_guided_analytical_chunked case\n",
1785 gtid));
1786
1787 trip = pr->u.p.tc;
1788
1789 KMP_DEBUG_ASSERT(nproc > 1);
1790 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1791
1792 while (1) { /* this while loop is a safeguard against unexpected zero
1793 chunk sizes */
1794 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1795 if (chunkIdx >= (UT)pr->u.p.parm2) {
1796 --trip;
1797 /* use dynamic-style scheduling */
1798 init = chunkIdx * chunkspec + pr->u.p.count;
1799 /* need to verify init > 0 in case of overflow in the above
1800 * calculation */
1801 if ((status = (init > 0 && init <= trip)) != 0) {
1802 limit = init + chunkspec - 1;
1803
1804 if ((last = (limit >= trip)) != 0)
1805 limit = trip;
1806 }
1807 break;
1808 } else {
1809/* use exponential-style scheduling */
1810/* The following check works around the lack of long double precision on
1811 Windows* OS.
1812 Without it, init might not be 0 for chunkIdx == 0.
1813 */
1814#if KMP_USE_X87CONTROL
1815 /* If we haven't already done so, save original
1816 FPCW and set precision to 64-bit, as Windows* OS
1817 on IA-32 architecture defaults to 53-bit */
1818 if (!fpcwSet) {
1819 oldFpcw = _control87(0, 0);
1820 _control87(_PC_64, _MCW_PC);
1821 fpcwSet = 0x30000;
1822 }
1823#endif
1824 if (chunkIdx) {
1825 init = __kmp_dispatch_guided_remaining<T>(
1826 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1827 KMP_DEBUG_ASSERT(init);
1828 init = trip - init;
1829 } else
1830 init = 0;
1831 limit = trip - __kmp_dispatch_guided_remaining<T>(
1832 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1833 KMP_ASSERT(init <= limit);
1834 if (init < limit) {
1835 KMP_DEBUG_ASSERT(limit <= trip);
1836 --limit;
1837 status = 1;
1838 break;
1839 } // if
1840 } // if
1841 } // while (1)
1842#if KMP_USE_X87CONTROL
1843 /* restore FPCW if necessary
1844 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1845 */
1846 if (fpcwSet && (oldFpcw & fpcwSet))
1847 _control87(oldFpcw, _MCW_PC);
1848#endif
1849 if (status != 0) {
1850 start = pr->u.p.lb;
1851 incr = pr->u.p.st;
1852 if (p_st != NULL)
1853 *p_st = incr;
1854 *p_lb = start + init * incr;
1855 *p_ub = start + limit * incr;
1856 if (pr->flags.ordered) {
1857 pr->u.p.ordered_lower = init;
1858 pr->u.p.ordered_upper = limit;
1859 }
1860 } else {
1861 *p_lb = 0;
1862 *p_ub = 0;
1863 if (p_st != NULL)
1864 *p_st = 0;
1865 }
1866 } // case
1867 break;
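// The analytical variant does not consult the live counter to size a chunk;
// it models how many iterations should remain after k chunks and derives the
// bounds of chunk k from that curve, evaluated in long-double precision by
// __kmp_dispatch_guided_remaining(). The sketch below is a rough double-
// precision model of that idea, assuming remaining(k) ~ trip * base^k with
// `base` the per-chunk shrink factor stored in parm3; it is an approximation
// for illustration, not the runtime's computation.
#if 0
#include <cmath>
#include <cstdint>

static uint64_t modeled_remaining(uint64_t trip, double base, uint64_t k) {
  return (uint64_t)((double)trip * std::pow(base, (double)k));
}

// Chunk k covers [trip - remaining(k), trip - remaining(k+1) - 1].
static void modeled_chunk_bounds(uint64_t trip, double base, uint64_t k,
                                 uint64_t *init, uint64_t *limit) {
  *init = trip - modeled_remaining(trip, base, k);
  *limit = trip - modeled_remaining(trip, base, k + 1) - 1;
}
#endif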
1868
1869 case kmp_sch_trapezoidal: {
1870 UT index;
1871 T parm2 = pr->u.p.parm2;
1872 T parm3 = pr->u.p.parm3;
1873 T parm4 = pr->u.p.parm4;
1874 KD_TRACE(100,
1875 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1876 gtid));
1877
1878 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1879
1880 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1881 trip = pr->u.p.tc - 1;
1882
1883 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1884 *p_lb = 0;
1885 *p_ub = 0;
1886 if (p_st != NULL)
1887 *p_st = 0;
1888 } else {
1889 start = pr->u.p.lb;
1890 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1891 incr = pr->u.p.st;
1892
1893 if ((last = (limit >= trip)) != 0)
1894 limit = trip;
1895
1896 if (p_st != NULL)
1897 *p_st = incr;
1898
1899 if (incr == 1) {
1900 *p_lb = start + init;
1901 *p_ub = start + limit;
1902 } else {
1903 *p_lb = start + init * incr;
1904 *p_ub = start + limit * incr;
1905 }
1906
1907 if (pr->flags.ordered) {
1908 pr->u.p.ordered_lower = init;
1909 pr->u.p.ordered_upper = limit;
1910 } // if
1911 } // if
1912 } // case
1913 break;
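// The trapezoid schedule hands out chunks whose sizes decrease linearly from
// the first chunk size (parm2) by a fixed decrement (parm4) over parm3
// chunks, so the start of chunk `index` is the partial sum of that arithmetic
// sequence. The sketch below checks the closed form used above against a
// direct summation; the parameter names mirror parm2/parm4 but the helper
// itself is illustrative only.
#if 0
#include <cassert>
#include <cstdint>

static uint64_t trapezoid_start(uint64_t index, uint64_t first_chunk,
                                uint64_t shrink) {
  // closed form used by the dispatcher above
  uint64_t closed = (index * (2 * first_chunk - (index - 1) * shrink)) / 2;
  // direct summation of the first `index` chunk sizes, for comparison
  uint64_t sum = 0;
  for (uint64_t j = 0; j < index; ++j)
    sum += first_chunk - j * shrink;
  assert(sum == closed); // holds while the chunk sizes stay non-negative
  return closed;
}
#endif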
1914 default: {
1915 status = 0; // to avoid complaints about uninitialized variable use
1916 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1917 KMP_HNT(GetNewerLibrary), // Hint
1918 __kmp_msg_null // Variadic argument list terminator
1919 );
1920 } break;
1921 } // switch
1922 if (p_last)
1923 *p_last = last;
1924#ifdef KMP_DEBUG
1925 if (pr->flags.ordered) {
1926 char *buff;
1927 // create format specifiers before the debug output
1928 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1929 "ordered_lower:%%%s ordered_upper:%%%s\n",
1930 traits_t<UT>::spec, traits_t<UT>::spec);
1931 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1932 __kmp_str_free(&buff);
1933 }
1934 {
1935 char *buff;
1936 // create format specifiers before the debug output
1937 buff = __kmp_str_format(
1938 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1939 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1940 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1941 KMP_DEBUG_ASSERT(p_last);
1942 KMP_DEBUG_ASSERT(p_st);
1943 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1944 __kmp_str_free(&buff);
1945 }
1946#endif
1947 return status;
1948}
1949
1950/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1951 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1952 is not called. */
1953#if OMPT_SUPPORT && OMPT_OPTIONAL
1954#define OMPT_LOOP_END \
1955 if (status == 0) { \
1956 if (ompt_enabled.ompt_callback_work) { \
1957 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1958 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1959 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1960 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1961 &(task_info->task_data), 0, codeptr); \
1962 } \
1963 }
1964// TODO: implement count
1965#else
1966#define OMPT_LOOP_END // no-op
1967#endif
1968
1969#if KMP_STATS_ENABLED
1970#define KMP_STATS_LOOP_END \
1971 { \
1972 kmp_int64 u, l, t, i; \
1973 l = (kmp_int64)(*p_lb); \
1974 u = (kmp_int64)(*p_ub); \
1975 i = (kmp_int64)(pr->u.p.st); \
1976 if (status == 0) { \
1977 t = 0; \
1978 KMP_POP_PARTITIONED_TIMER(); \
1979 } else if (i == 1) { \
1980 if (u >= l) \
1981 t = u - l + 1; \
1982 else \
1983 t = 0; \
1984 } else if (i < 0) { \
1985 if (l >= u) \
1986 t = (l - u) / (-i) + 1; \
1987 else \
1988 t = 0; \
1989 } else { \
1990 if (u >= l) \
1991 t = (u - l) / i + 1; \
1992 else \
1993 t = 0; \
1994 } \
1995 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1996 }
1997#else
1998#define KMP_STATS_LOOP_END /* Nothing */
1999#endif
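// KMP_STATS_LOOP_END above counts the iterations in the chunk that was just
// returned: for stride i the count is distance / |i| + 1, taken in the
// direction of the stride, or 0 when the bounds are already crossed. A
// stand-alone version of the same arithmetic, for illustration:
#if 0
#include <cstdint>

static int64_t chunk_iterations(int status, int64_t l, int64_t u, int64_t i) {
  if (status == 0)
    return 0; // no chunk was returned
  if (i == 1)
    return (u >= l) ? u - l + 1 : 0;
  if (i < 0)
    return (l >= u) ? (l - u) / (-i) + 1 : 0;
  return (u >= l) ? (u - l) / i + 1 : 0;
}
// e.g. chunk_iterations(1, 0, 9, 2) == 5   -> iterations 0,2,4,6,8
//      chunk_iterations(1, 10, 1, -3) == 4 -> iterations 10,7,4,1
#endif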
2000
2001template <typename T>
2002static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2003 T *p_lb, T *p_ub,
2004 typename traits_t<T>::signed_t *p_st
2005#if OMPT_SUPPORT && OMPT_OPTIONAL
2006 ,
2007 void *codeptr
2008#endif
2009) {
2010
2011 typedef typename traits_t<T>::unsigned_t UT;
2012 typedef typename traits_t<T>::signed_t ST;
2013 // This is potentially slightly misleading: schedule(runtime) will appear here
2014 // even if the actual runtime schedule is static. (Which points out a
2015 // disadvantage of schedule(runtime): even when static scheduling is used, it
2016 // costs more than a compile-time choice of static scheduling would.)
2017 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2018
2019 int status;
2020 dispatch_private_info_template<T> *pr;
2021 __kmp_assert_valid_gtid(gtid);
2022 kmp_info_t *th = __kmp_threads[gtid];
2023 kmp_team_t *team = th->th.th_team;
2024
2025 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2026 KD_TRACE(
2027 1000,
2028 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2029 gtid, p_lb, p_ub, p_st, p_last));
2030
2031 if (team->t.t_serialized) {
2032 /* NOTE: serialize this dispatch because we are not at the active level */
2033 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2034 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2035 KMP_DEBUG_ASSERT(pr);
2036
2037 if ((status = (pr->u.p.tc != 0)) == 0) {
2038 *p_lb = 0;
2039 *p_ub = 0;
2040 // if ( p_last != NULL )
2041 // *p_last = 0;
2042 if (p_st != NULL)
2043 *p_st = 0;
2044 if (__kmp_env_consistency_check) {
2045 if (pr->pushed_ws != ct_none) {
2046 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2047 }
2048 }
2049 } else if (pr->flags.nomerge) {
2050 kmp_int32 last;
2051 T start;
2052 UT limit, trip, init;
2053 ST incr;
2054 T chunk = pr->u.p.parm1;
2055
2056 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2057 gtid));
2058
2059 init = chunk * pr->u.p.count++;
2060 trip = pr->u.p.tc - 1;
2061
2062 if ((status = (init <= trip)) == 0) {
2063 *p_lb = 0;
2064 *p_ub = 0;
2065 // if ( p_last != NULL )
2066 // *p_last = 0;
2067 if (p_st != NULL)
2068 *p_st = 0;
2069 if (__kmp_env_consistency_check) {
2070 if (pr->pushed_ws != ct_none) {
2071 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2072 }
2073 }
2074 } else {
2075 start = pr->u.p.lb;
2076 limit = chunk + init - 1;
2077 incr = pr->u.p.st;
2078
2079 if ((last = (limit >= trip)) != 0) {
2080 limit = trip;
2081#if KMP_OS_WINDOWS
2082 pr->u.p.last_upper = pr->u.p.ub;
2083#endif /* KMP_OS_WINDOWS */
2084 }
2085 if (p_last != NULL)
2086 *p_last = last;
2087 if (p_st != NULL)
2088 *p_st = incr;
2089 if (incr == 1) {
2090 *p_lb = start + init;
2091 *p_ub = start + limit;
2092 } else {
2093 *p_lb = start + init * incr;
2094 *p_ub = start + limit * incr;
2095 }
2096
2097 if (pr->flags.ordered) {
2098 pr->u.p.ordered_lower = init;
2099 pr->u.p.ordered_upper = limit;
2100#ifdef KMP_DEBUG
2101 {
2102 char *buff;
2103 // create format specifiers before the debug output
2104 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2105 "ordered_lower:%%%s ordered_upper:%%%s\n",
2106 traits_t<UT>::spec, traits_t<UT>::spec);
2107 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2108 pr->u.p.ordered_upper));
2109 __kmp_str_free(&buff);
2110 }
2111#endif
2112 } // if
2113 } // if
2114 } else {
2115 pr->u.p.tc = 0;
2116 *p_lb = pr->u.p.lb;
2117 *p_ub = pr->u.p.ub;
2118#if KMP_OS_WINDOWS
2119 pr->u.p.last_upper = *p_ub;
2120#endif /* KMP_OS_WINDOWS */
2121 if (p_last != NULL)
2122 *p_last = TRUE;
2123 if (p_st != NULL)
2124 *p_st = pr->u.p.st;
2125 } // if
2126#ifdef KMP_DEBUG
2127 {
2128 char *buff;
2129 // create format specifiers before the debug output
2130 buff = __kmp_str_format(
2131 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2132 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2133 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2134 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2135 (p_last ? *p_last : 0), status));
2136 __kmp_str_free(&buff);
2137 }
2138#endif
2139#if INCLUDE_SSC_MARKS
2140 SSC_MARK_DISPATCH_NEXT();
2141#endif
2142 OMPT_LOOP_END;
2143 KMP_STATS_LOOP_END;
2144 return status;
2145 } else {
2146 kmp_int32 last = 0;
2147 dispatch_shared_info_template<T> volatile *sh;
2148
2149 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2150 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2151
2152 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2153 th->th.th_dispatch->th_dispatch_pr_current);
2154 KMP_DEBUG_ASSERT(pr);
2155 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2156 th->th.th_dispatch->th_dispatch_sh_current);
2157 KMP_DEBUG_ASSERT(sh);
2158
2159#if KMP_USE_HIER_SCHED
2160 if (pr->flags.use_hier)
2161 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2162 else
2163#endif // KMP_USE_HIER_SCHED
2164 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2165 p_st, th->th.th_team_nproc,
2166 th->th.th_info.ds.ds_tid);
2167 // status == 0: no more iterations to execute
2168 if (status == 0) {
2169 ST num_done;
2170 num_done = test_then_inc<ST>(&sh->u.s.num_done);
2171#ifdef KMP_DEBUG
2172 {
2173 char *buff;
2174 // create format specifiers before the debug output
2175 buff = __kmp_str_format(
2176 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2177 traits_t<ST>::spec);
2178 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2179 __kmp_str_free(&buff);
2180 }
2181#endif
2182
2183#if KMP_USE_HIER_SCHED
2184 pr->flags.use_hier = FALSE;
2185#endif
2186 if (num_done == th->th.th_team_nproc - 1) {
2187#if KMP_STATIC_STEAL_ENABLED
2188 if (pr->schedule == kmp_sch_static_steal) {
2189 int i;
2190 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2191 __kmp_dispatch_num_buffers; // current loop index
2192 // loop complete, safe to destroy locks used for stealing
2193 for (i = 0; i < th->th.th_team_nproc; ++i) {
2194 dispatch_private_info_template<T> *buf =
2195 reinterpret_cast<dispatch_private_info_template<T> *>(
2196 &team->t.t_dispatch[i].th_disp_buffer[idx]);
2197 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2198 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2199 if (traits_t<T>::type_size > 4) {
2200 // destroy locks used for stealing
2201 kmp_lock_t *lck = buf->u.p.steal_lock;
2202 KMP_ASSERT(lck != NULL);
2203 __kmp_destroy_lock(lck);
2204 __kmp_free(lck);
2205 buf->u.p.steal_lock = NULL;
2206 }
2207 }
2208 }
2209#endif
2210 /* NOTE: release shared buffer to be reused */
2211
2212 KMP_MB(); /* Flush all pending memory write invalidates. */
2213
2214 sh->u.s.num_done = 0;
2215 sh->u.s.iteration = 0;
2216
2217 /* TODO replace with general release procedure? */
2218 if (pr->flags.ordered) {
2219 sh->u.s.ordered_iteration = 0;
2220 }
2221
2222 sh->buffer_index += __kmp_dispatch_num_buffers;
2223 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2224 gtid, sh->buffer_index));
2225
2226 KMP_MB(); /* Flush all pending memory write invalidates. */
2227
2228 } // if
2229 if (__kmp_env_consistency_check) {
2230 if (pr->pushed_ws != ct_none) {
2231 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2232 }
2233 }
2234
2235 th->th.th_dispatch->th_deo_fcn = NULL;
2236 th->th.th_dispatch->th_dxo_fcn = NULL;
2237 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2238 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2239 } // if (status == 0)
2240#if KMP_OS_WINDOWS
2241 else if (last) {
2242 pr->u.p.last_upper = pr->u.p.ub;
2243 }
2244#endif /* KMP_OS_WINDOWS */
2245 if (p_last != NULL && status != 0)
2246 *p_last = last;
2247 } // if
2248
2249#ifdef KMP_DEBUG
2250 {
2251 char *buff;
2252 // create format specifiers before the debug output
2253 buff = __kmp_str_format(
2254 "__kmp_dispatch_next: T#%%d normal case: "
2255 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2256 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2257 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2258 (p_last ? *p_last : 0), status));
2259 __kmp_str_free(&buff);
2260 }
2261#endif
2262#if INCLUDE_SSC_MARKS
2263 SSC_MARK_DISPATCH_NEXT();
2264#endif
2265 OMPT_LOOP_END;
2266 KMP_STATS_LOOP_END;
2267 return status;
2268}
2269
2270template <typename T>
2271static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2272 kmp_int32 *plastiter, T *plower, T *pupper,
2273 typename traits_t<T>::signed_t incr) {
2274 typedef typename traits_t<T>::unsigned_t UT;
2275 kmp_uint32 team_id;
2276 kmp_uint32 nteams;
2277 UT trip_count;
2278 kmp_team_t *team;
2279 kmp_info_t *th;
2280
2281 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2282 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2283#ifdef KMP_DEBUG
2284 typedef typename traits_t<T>::signed_t ST;
2285 {
2286 char *buff;
2287 // create format specifiers before the debug output
2288 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2289 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2290 traits_t<T>::spec, traits_t<T>::spec,
2291 traits_t<ST>::spec, traits_t<T>::spec);
2292 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2293 __kmp_str_free(&buff);
2294 }
2295#endif
2296
2297 if (__kmp_env_consistency_check) {
2298 if (incr == 0) {
2299 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2300 loc);
2301 }
2302 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2303 // The loop is illegal.
2304 // Some zero-trip loops are kept by the compiler, e.g.:
2305 // for(i=10;i<0;++i) // lower >= upper - run-time check
2306 // for(i=0;i>10;--i) // lower <= upper - run-time check
2307 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2308 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2309 // Compiler does not check the following illegal loops:
2310 // for(i=0;i<10;i+=incr) // where incr<0
2311 // for(i=10;i>0;i-=incr) // where incr<0
2312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2313 }
2314 }
2315 __kmp_assert_valid_gtid(gtid);
2316 th = __kmp_threads[gtid];
2317 team = th->th.th_team;
2318 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2319 nteams = th->th.th_teams_size.nteams;
2320 team_id = team->t.t_master_tid;
2321 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2322
2323 // compute global trip count
2324 if (incr == 1) {
2325 trip_count = *pupper - *plower + 1;
2326 } else if (incr == -1) {
2327 trip_count = *plower - *pupper + 1;
2328 } else if (incr > 0) {
2329 // upper-lower can exceed the limit of signed type
2330 trip_count = (UT)(*pupper - *plower) / incr + 1;
2331 } else {
2332 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2333 }
2334
2335 if (trip_count <= nteams) {
2336 KMP_DEBUG_ASSERT(
2337 __kmp_static == kmp_sch_static_greedy ||
2338 __kmp_static ==
2339 kmp_sch_static_balanced); // Unknown static scheduling type.
2340 // only some teams get a single iteration; the others get nothing
2341 if (team_id < trip_count) {
2342 *pupper = *plower = *plower + team_id * incr;
2343 } else {
2344 *plower = *pupper + incr; // zero-trip loop
2345 }
2346 if (plastiter != NULL)
2347 *plastiter = (team_id == trip_count - 1);
2348 } else {
2349 if (__kmp_static == kmp_sch_static_balanced) {
2350 UT chunk = trip_count / nteams;
2351 UT extras = trip_count % nteams;
2352 *plower +=
2353 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2354 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2355 if (plastiter != NULL)
2356 *plastiter = (team_id == nteams - 1);
2357 } else {
2358 T chunk_inc_count =
2359 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2360 T upper = *pupper;
2361 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2362 // Unknown static scheduling type.
2363 *plower += team_id * chunk_inc_count;
2364 *pupper = *plower + chunk_inc_count - incr;
2365 // Check/correct bounds if needed
2366 if (incr > 0) {
2367 if (*pupper < *plower)
2368 *pupper = traits_t<T>::max_value;
2369 if (plastiter != NULL)
2370 *plastiter = *plower <= upper && *pupper > upper - incr;
2371 if (*pupper > upper)
2372 *pupper = upper; // tracker C73258
2373 } else {
2374 if (*pupper > *plower)
2375 *pupper = traits_t<T>::min_value;
2376 if (plastiter != NULL)
2377 *plastiter = *plower >= upper && *pupper < upper - incr;
2378 if (*pupper < upper)
2379 *pupper = upper; // tracker C73258
2380 }
2381 }
2382 }
2383}
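// For the balanced branch above, the trip count is split across nteams teams
// so that the first trip_count % nteams teams receive one extra iteration.
// The sketch below models that partition in iteration space with plain
// integer types; the runtime applies the same arithmetic directly to the
// original loop bounds and stride, so this is an illustration only.
#if 0
#include <cstdint>

static void team_block(uint64_t trip_count, uint32_t nteams, uint32_t team_id,
                       uint64_t *first, uint64_t *count) {
  uint64_t chunk = trip_count / nteams;
  uint64_t extras = trip_count % nteams;
  *first = team_id * chunk + (team_id < extras ? team_id : extras);
  *count = chunk + (team_id < extras ? 1 : 0);
}
// e.g. trip_count = 10, nteams = 4 -> blocks of 3,3,2,2 starting at 0,3,6,8.
#endif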
2384
2385//-----------------------------------------------------------------------------
2386// Dispatch routines
2387// Transfer call to template< type T >
2388// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2389// T lb, T ub, ST st, ST chunk )
2390extern "C" {
2391
2408void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2409 enum sched_type schedule, kmp_int32 lb,
2410 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2411 KMP_DEBUG_ASSERT(__kmp_init_serial);
2412#if OMPT_SUPPORT && OMPT_OPTIONAL
2413 OMPT_STORE_RETURN_ADDRESS(gtid);
2414#endif
2415 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2416}
2420void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2421 enum sched_type schedule, kmp_uint32 lb,
2422 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2423 KMP_DEBUG_ASSERT(__kmp_init_serial);
2424#if OMPT_SUPPORT && OMPT_OPTIONAL
2425 OMPT_STORE_RETURN_ADDRESS(gtid);
2426#endif
2427 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2428}
2429
2433void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2434 enum sched_type schedule, kmp_int64 lb,
2435 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2436 KMP_DEBUG_ASSERT(__kmp_init_serial);
2437#if OMPT_SUPPORT && OMPT_OPTIONAL
2438 OMPT_STORE_RETURN_ADDRESS(gtid);
2439#endif
2440 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2441}
2442
2446void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2447 enum sched_type schedule, kmp_uint64 lb,
2448 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2449 KMP_DEBUG_ASSERT(__kmp_init_serial);
2450#if OMPT_SUPPORT && OMPT_OPTIONAL
2451 OMPT_STORE_RETURN_ADDRESS(gtid);
2452#endif
2453 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2454}
2455
2465void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2466 enum sched_type schedule, kmp_int32 *p_last,
2467 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2468 kmp_int32 chunk) {
2469 KMP_DEBUG_ASSERT(__kmp_init_serial);
2470#if OMPT_SUPPORT && OMPT_OPTIONAL
2471 OMPT_STORE_RETURN_ADDRESS(gtid);
2472#endif
2473 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2474 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2475}
2476
2477void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2478 enum sched_type schedule, kmp_int32 *p_last,
2479 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2480 kmp_int32 chunk) {
2481 KMP_DEBUG_ASSERT(__kmp_init_serial);
2482#if OMPT_SUPPORT && OMPT_OPTIONAL
2483 OMPT_STORE_RETURN_ADDRESS(gtid);
2484#endif
2485 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2486 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2487}
2488
2489void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2490 enum sched_type schedule, kmp_int32 *p_last,
2491 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2492 kmp_int64 chunk) {
2493 KMP_DEBUG_ASSERT(__kmp_init_serial);
2494#if OMPT_SUPPORT && OMPT_OPTIONAL
2495 OMPT_STORE_RETURN_ADDRESS(gtid);
2496#endif
2497 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2498 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2499}
2500
2501void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2502 enum sched_type schedule, kmp_int32 *p_last,
2503 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2504 kmp_int64 chunk) {
2505 KMP_DEBUG_ASSERT(__kmp_init_serial);
2506#if OMPT_SUPPORT && OMPT_OPTIONAL
2507 OMPT_STORE_RETURN_ADDRESS(gtid);
2508#endif
2509 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2510 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2511}
2512
2526int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2527 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2528#if OMPT_SUPPORT && OMPT_OPTIONAL
2529 OMPT_STORE_RETURN_ADDRESS(gtid);
2530#endif
2531 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2532#if OMPT_SUPPORT && OMPT_OPTIONAL
2533 ,
2534 OMPT_LOAD_RETURN_ADDRESS(gtid)
2535#endif
2536 );
2537}
2538
2542int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2543 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2544 kmp_int32 *p_st) {
2545#if OMPT_SUPPORT && OMPT_OPTIONAL
2546 OMPT_STORE_RETURN_ADDRESS(gtid);
2547#endif
2548 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2549#if OMPT_SUPPORT && OMPT_OPTIONAL
2550 ,
2551 OMPT_LOAD_RETURN_ADDRESS(gtid)
2552#endif
2553 );
2554}
2555
2559int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2560 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2561#if OMPT_SUPPORT && OMPT_OPTIONAL
2562 OMPT_STORE_RETURN_ADDRESS(gtid);
2563#endif
2564 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2565#if OMPT_SUPPORT && OMPT_OPTIONAL
2566 ,
2567 OMPT_LOAD_RETURN_ADDRESS(gtid)
2568#endif
2569 );
2570}
2571
2575int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2576 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2577 kmp_int64 *p_st) {
2578#if OMPT_SUPPORT && OMPT_OPTIONAL
2579 OMPT_STORE_RETURN_ADDRESS(gtid);
2580#endif
2581 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2582#if OMPT_SUPPORT && OMPT_OPTIONAL
2583 ,
2584 OMPT_LOAD_RETURN_ADDRESS(gtid)
2585#endif
2586 );
2587}
2588
2595void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2596 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2597}
2598
2602void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2603 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2604}
2605
2609void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2610 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2611}
2612
2616void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2617 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2618}
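// Taken together, the entry points above implement the protocol a compiler
// emits for a dynamically scheduled worksharing loop: one dispatch_init call,
// then a dispatch_next loop until it returns 0 (the dispatch_fini_* calls are
// used per chunk for ordered loops). The sketch below shows that calling
// sequence for a simple schedule(dynamic, 4) loop over 0..99; it is a hedged
// illustration of typical compiler-generated code, not the output of any
// particular compiler.
#if 0
static void emitted_loop_body(ident_t *loc, kmp_int32 gtid) {
  kmp_int32 lb = 0, ub = 99, st = 1, last = 0;
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st) {
      // ... user loop body ...
    }
  }
}
#endif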
2621//-----------------------------------------------------------------------------
2622// Non-template routines from kmp_dispatch.cpp used in other sources
2623
2624kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2625 return value == checker;
2626}
2627
2628kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2629 return value != checker;
2630}
2631
2632kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2633 return value < checker;
2634}
2635
2636kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2637 return value >= checker;
2638}
2639
2640kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2641 return value <= checker;
2642}
2643
2644kmp_uint32
2645__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2646 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2647 void *obj // Higher-level synchronization object, or NULL.
2648) {
2649 // note: we may not belong to a team at this point
2650 volatile kmp_uint32 *spin = spinner;
2651 kmp_uint32 check = checker;
2652 kmp_uint32 spins;
2653 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2654 kmp_uint32 r;
2655
2656 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2657 KMP_INIT_YIELD(spins);
2658 // main wait spin loop
2659 while (!f(r = TCR_4(*spin), check)) {
2660 KMP_FSYNC_SPIN_PREPARE(obj);
2661 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2662 split. It causes problems with infinite recursion because of exit lock */
2663 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2664 __kmp_abort_thread(); */
2665 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2666 }
2667 KMP_FSYNC_SPIN_ACQUIRED(obj);
2668 return r;
2669}
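// __kmp_wait_4 spins until the supplied predicate holds for the watched
// location, yielding when the machine is oversubscribed. A typical use pairs
// it with one of the small comparison helpers above, e.g. waiting for a
// shared counter to reach an expected value; `done_count` below is a
// hypothetical flag used only for illustration.
#if 0
static void wait_until_equal(volatile kmp_uint32 *done_count,
                             kmp_uint32 expected) {
  __kmp_wait_4(done_count, expected, __kmp_eq_4, NULL);
}
#endif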
2670
2671void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2672 kmp_uint32 (*pred)(void *, kmp_uint32),
2673 void *obj // Higher-level synchronization object, or NULL.
2674) {
2675 // note: we may not belong to a team at this point
2676 void *spin = spinner;
2677 kmp_uint32 check = checker;
2678 kmp_uint32 spins;
2679 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2680
2681 KMP_FSYNC_SPIN_INIT(obj, spin);
2682 KMP_INIT_YIELD(spins);
2683 // main wait spin loop
2684 while (!f(spin, check)) {
2685 KMP_FSYNC_SPIN_PREPARE(obj);
2686 /* if we have waited a bit, or are oversubscribed, yield */
2687 /* pause is in the following code */
2688 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2689 }
2690 KMP_FSYNC_SPIN_ACQUIRED(obj);
2691}
2692
2693} // extern "C"
2694
2695#ifdef KMP_GOMP_COMPAT
2696
2697void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2698 enum sched_type schedule, kmp_int32 lb,
2699 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2700 int push_ws) {
2701 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2702 push_ws);
2703}
2704
2705void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2706 enum sched_type schedule, kmp_uint32 lb,
2707 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2708 int push_ws) {
2709 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2710 push_ws);
2711}
2712
2713void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2714 enum sched_type schedule, kmp_int64 lb,
2715 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2716 int push_ws) {
2717 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2718 push_ws);
2719}
2720
2721void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2722 enum sched_type schedule, kmp_uint64 lb,
2723 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2724 int push_ws) {
2725 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2726 push_ws);
2727}
2728
2729void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2730 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2731}
2732
2733void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2734 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2735}
2736
2737void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2738 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2739}
2740
2741void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2742 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2743}
2744
2745#endif /* KMP_GOMP_COMPAT */
2746
2747/* ------------------------------------------------------------------------ */