#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_USE_X87CONTROL
#include <float.h> // _control87, _PC_64, _MCW_PC
#endif
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Initialize a dispatch_private_info_template<T> buffer for a particular
// schedule and chunk; the loop description is given by lb, ub and st.
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // there is only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or the default if not
      // specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
#if OMP_45_ENABLED
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#endif // OMP_45_ENABLED
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    tc = (ub >= lb) ? ub - lb + 1 : 0; // zero-trip loop if ub < lb
  } else if (st < 0) {
    // cast to unsigned so the division stays well defined for huge ranges
    tc = (lb >= ub) ? (UT)(lb - ub) / (-st) + 1 : 0;
  } else { // st > 0
    tc = (ub >= lb) ? (UT)(ub - lb) / st + 1 : 0;
  }

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;
#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */
  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }
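  // Worked example of the trip-count formula above (added for illustration,
  // not part of the original source): for lb = 0, ub = 9, st = 3 the loop
  // visits i = 0, 3, 6, 9, so tc = (ub - lb) / st + 1 = 9/3 + 1 = 4. The
  // unsigned cast matters when ub - lb would overflow the signed type T.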
  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb; // keep the lower bound for later chunk extraction
      // parm3 is not used in static_steal
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // For wider index types the count/ub pair cannot be updated with a
        // single 64-bit CAS, so use a dynamically allocated per-thread lock;
        // it is freed in __kmp_dispatch_next() when status == 0.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few chunks: fall through to kmp_sch_static_balanced */
    } // if
  } // case
  // fall through to kmp_sch_static_balanced
#endif
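  // Illustration (added, not part of the original source): with tc = 103
  // iterations, chunk = 10 and nproc = 4, ntc = 11 chunks, small_chunk = 2
  // and extras = 3, so threads 0..2 own 3 chunks each and thread 3 owns 2;
  // u.p.count and u.p.ub delimit the owned chunk range that other threads
  // may later steal from.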
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate matches
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
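  // Illustration (added): for tc = 10 and nproc = 4, small_chunk = 2 and
  // extras = 2, so the [init, limit] ranges are T0:[0,2], T1:[3,5], T2:[6,7],
  // T3:[8,9]; only the last thread sets parm1, which later feeds *p_last.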
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but the per-thread chunk is rounded to a multiple
    // of the simd width passed in through "chunk"
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
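  // Note (added): the "& ~(chunk - 1)" mask rounds the per-thread share down
  // to a multiple of chunk and relies on chunk (the simd width here) being a
  // power of two. E.g. tc = 100, nth = 8, chunk = 8: (100 + 7)/8 = 13, then
  // (13 + 7) & ~7 = 16 iterations per thread.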
#if OMP_45_ENABLED
  case kmp_sch_guided_simd:
#endif // OMP_45_ENABLED
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        DBL x;
#if KMP_USE_X87CONTROL
        /* Windows* OS on IA-32 defaults to 53-bit x87 precision; temporarily
           set 64-bit precision so the solver below is accurate enough. */
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point: chunk indexes equal to or greater than this point
           switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment of parm3, which is read as a DBL
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the cross-over point: the smallest i with x^i <= target */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */
          right = 229; // any positive starting guess works
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
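  // Background (added): the guided_analytical setup models the chunk sequence
  // as a geometric series. With x = 1 - 1/(2*nproc), chunk index i receives
  // roughly tc/(2*nproc) * x^i iterations, so the bisection above finds the
  // first index "cross" for which x^cross <= target = (2*chunk+1)*nproc/tc,
  // i.e. the point where the shrinking chunks would fall below the requested
  // chunk size; from that index on the "next" code hands out fixed chunks.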
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));
    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing increment of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;
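  // Worked example (added): for tc = 100, nproc = 4 and chunk = 1, the first
  // chunk is parm2 = 100/(2*4) = 12 iterations and the minimum chunk is
  // parm1 = 1; the number of chunks is parm3 = (2*100 + 13 - 1)/13 = 16 and
  // the per-chunk decrement is parm4 = (12 - 1)/(16 - 1) = 0 after integer
  // truncation, so chunks stay at 12 here and the "next" code simply stops
  // once the computed start index exceeds the trip count.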
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  // Use the scheds/chunks parsed from OMP_SCHEDULE for the hierarchy
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif // KMP_USE_HIER_SCHED
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, else long double
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    char *buff;
    typedef typename traits_t<T>::signed_t ST;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  // The macros expand to nothing when statistics are disabled, so this test
  // disappears in that configuration.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE.
  // Hierarchical scheduling does not work with ordered loops, so if ordered
  // is detected, revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical scheduling for ordered parallel loops
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }

    /* Wait until the shared buffer with index my_buffer_index is free to use */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: buffer index and my_buffer_index are always 32-bit integers.
    KMP_MB();
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);

#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // chunk was calculated earlier
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
#if OMP_45_ENABLED
      case kmp_sch_guided_simd:
#endif
        schedtype = 2;
        break;
      default:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#endif /* USE_ITT_BUILD */
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
  } // if (active)

#ifdef KMP_DEBUG
  {
    char *buff;
    typedef typename traits_t<T>::signed_t ST;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // The dedicated static_steal_counter marks that other threads may steal
  // from this thread from now on.
  if (schedule == kmp_sch_static_steal) {
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
   every iteration, or __kmp_dispatch_finish_chunk() after every chunk. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB();
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB();
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte induction variables and CAS for 4-byte ones
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;
        while ((!status) && (while_limit != ++while_index)) {
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, go to next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded: reduce victim's ub by 1/4 of the undone
          // chunks, or by 1
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember the victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with the stolen range
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable: combine count and ub into one 64-bit word
      // so both can be updated with a single CAS
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      union_i4 vold, vnew;
      vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
      vnew = vold;
      vnew.p.count++;
      while (!KMP_COMPARE_AND_STORE_ACQ64(
          (volatile kmp_int64 *)&pr->u.p.count,
          *VOLATILE_CAST(kmp_int64 *) & vold.b,
          *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
        KMP_CPU_PAUSE();
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
      }
      vnew = vold;
      init = vnew.p.count;
      status = (init < (UT)vnew.p.ub);

      if (!status) { // no own chunks, try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;
        while ((!status) && (while_limit != ++while_index)) {
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, go to next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (CAS succeeded)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
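    // Note (added): init/limit above are chunk indexes scaled by chunk into
    // iteration numbers, while parm2 still holds the loop lower bound saved
    // at init time; e.g. with lb = 100, chunk = 10 and incr = 1, chunk index
    // 3 yields *p_lb = 130 and *p_ub = 139.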
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if the thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
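  // Note (added): for dynamic_chunked, sh->u.s.iteration is a shared chunk
  // counter; the fetch-and-increment above hands chunk k to whichever thread
  // gets there first, covering iterations [k*chunk, min((k+1)*chunk - 1, tc-1)]
  // before scaling by the stride.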
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining < pr->u.p.parm2) {
        // below the cut-over point: use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init + (UT)(remaining *
                          *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
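      // Note (added): each successful CAS above claims roughly
      // remaining * (guided_flt_param / nproc) iterations, the factor cached
      // in parm3 at init time; once the remaining count drops below parm2 the
      // loop degenerates to plain dynamic scheduling with fixed
      // chunkspec-sized pieces.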
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

#if OMP_45_ENABLED
  case kmp_sch_guided_simd: {
    // The same as guided_iterative, but the chunk is adjusted to be a
    // multiple of the given chunk (the simd width)
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
#endif // OMP_45_ENABLED

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing the original FPCW value on Windows* OS, IA-32 architecture */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
           calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save the original FPCW and set
           precision to 64-bit, as Windows* OS on IA-32 architecture defaults
           to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary; check fpcwSet first because oldFpcw can be
       uninitialized here */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // Note: schedule(runtime) is charged to this timer even if the actual
  // runtime schedule turns out to be static.
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif
  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal; the sign of the increment does not match the
      // direction implied by the bounds.
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // clamp to the user upper bound
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // clamp to the user upper bound
      }
    }
  }
}
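// Illustration (added): for a distribute loop with *plower = 0, *pupper = 99,
// incr = 1 and nteams = 4 under kmp_sch_static_balanced, chunk = 25 and
// extras = 0, so team_id 0..3 receive [0,24], [25,49], [50,74] and [75,99];
// only team 3 sets *plastiter.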
// Dispatch entry points: transfer the call to the template implementation
// __kmp_dispatch_init<T>(), optionally computing per-team bounds first for
// the composite "distribute parallel for" construct.

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
// Get the next dynamically allocated chunk of work for this thread.
// Returns one if there is work to be done, zero otherwise; if there is no
// more work, the lb, ub and stride need not be modified.
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                        );
}

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                         );
}

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                        );
}

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                         );
}
// Mark the end of a dynamic loop.
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj // Higher-level synchronization object, or NULL.
                   ) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield;
       pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_yield_4_ptr(
    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
    void *obj // Higher-level synchronization object, or NULL.
    ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */