#if defined(_WIN32_WINNT) && defined(_M_IX86)
#define _WIN32_WINNT 0x0502
#endif

#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86

#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#include "ompt-specific.h"

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
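// __kmp_dispatch_init_algorithm below fills the per-thread loop descriptor
// (dispatch_private_info_template<T> *pr) for one dynamically scheduled loop:
// it normalizes the schedule kind, derives the trip count tc from lb/ub/st,
// and stashes schedule-specific state in pr->u.p.parm1..parm4. What each parmN
// means depends on the schedule, as the individual cases show.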
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
                                   kmp_uint64 *cur_chunk,
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  buff = __kmp_str_format(
      "__kmp_dispatch_init_algorithm: T#%%d called "
      "pr:%%p lb:%%%s ub:%%%s st:%%%s "
      "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
      traits_t<T>::spec, traits_t<T>::spec,
      traits_t<ST>::spec, traits_t<ST>::spec,
      traits_t<T>::spec, traits_t<T>::spec);
  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
  __kmp_str_free(&buff);

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;

#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    schedule = kmp_sch_static_steal;

  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    pr->flags.nomerge = TRUE;
    pr->flags.nomerge = FALSE;
  pr->type_size = traits_t<T>::type_size;
    pr->flags.ordered = TRUE;
    pr->flags.ordered = FALSE;

    schedule = __kmp_static;
    if (schedule == kmp_sch_runtime) {
      schedule = team->t.t_sched.r_sched_type;
        schedule = __kmp_guided;
        schedule = __kmp_static;
      chunk = team->t.t_sched.chunk;

      buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                              "schedule:%%d chunk:%%%s\n",
                              traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);

        schedule = __kmp_guided;
        chunk = KMP_DEFAULT_CHUNK;

        schedule = __kmp_auto;

        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);

    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }

    if (schedule == kmp_sch_runtime_simd) {
      schedule = team->t.t_sched.r_sched_type;
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
          schedule = kmp_sch_guided_simd;
        chunk = team->t.t_sched.chunk * chunk;

      buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
                              " chunk:%%%s\n",
                              traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);

#endif // OMP_45_ENABLED
  pr->u.p.parm1 = chunk;

              "unknown scheduling type");

  if (__kmp_env_consistency_check) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
  }

      tc = (UT)(lb - ub) / (-st) + 1;
      tc = (UT)(ub - lb) / st + 1;

    pr->u.p.last_upper = ub + st;

  if (pr->flags.ordered) {
    pr->ordered_bumped = 0;
    pr->u.p.ordered_lower = 1;
    pr->u.p.ordered_upper = 0;
  }
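  // The per-schedule cases below fill in the remaining state. For
  // kmp_sch_static_steal the iteration space is pre-divided into ntc chunks
  // that are dealt out to the nproc threads up front (small_chunk per thread
  // plus one extra for the first 'extras' threads); pr->u.p.parm4 remembers
  // the next victim to steal from once a thread drains its own range, and for
  // induction types wider than 4 bytes a per-thread th_steal_lock guards the
  // (count, ub) pair.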
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm4 = (id + 1) % nproc;

      if (traits_t<T>::type_size > 4) {
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
  }

  case kmp_sch_static_balanced: {
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
              gtid));

        pr->u.p.parm1 = (id == tc - 1);
        pr->u.p.parm1 = FALSE;

        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);

        pr->u.p.parm1 = TRUE;
        pr->u.p.parm1 = FALSE;

    if (itt_need_metadata_reporting)
      *cur_chunk = limit - init + 1;

      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;

      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);

    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
  }

  case kmp_sch_static_balanced_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
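    // Note: the expression above rounds each thread's share of the trip count
    // up to a multiple of 'chunk' (the simd width handed in for
    // runtime(simd:static)); (x + chunk - 1) & ~(chunk - 1) is the usual
    // round-up idiom and assumes chunk is a power of two.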
  case kmp_sch_guided_simd:
#endif // OMP_45_ENABLED
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked",
              gtid));

      if ((2L * chunk + 1) * nproc >= tc) {
        schedule = kmp_sch_dynamic_chunked;
      } else {
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 = guided_flt_param / nproc;
      }

      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      KD_TRACE(100,
               ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
                gtid));
  }

  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

      if ((2L * chunk + 1) * nproc >= tc) {
        schedule = kmp_sch_dynamic_chunked;

#if KMP_OS_WINDOWS && KMP_ARCH_X86
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);

        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        x = (long double)1.0 - (long double)0.5 / nproc;

        ptrdiff_t natural_alignment =
            (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
        KMP_DEBUG_ASSERT(
            (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);

        *(DBL *)&pr->u.p.parm3 = x;

          p = __kmp_pow<UT>(x, right);
        } while (p > target && right < (1 << 27));

        while (left + 1 < right) {
          mid = (left + right) / 2;
          if (__kmp_pow<UT>(x, mid) > target) {

        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        pr->u.p.parm2 = cross;

#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        _control87(oldFpcw, _MCW_PC);
#endif

      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
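    // For the analytical guided schedule the chunk sizes follow a geometric
    // series with ratio x = 1 - 0.5/nproc (kept as a DBL in parm3). The code
    // above searches, first by doubling 'right' and then by bisection, for the
    // crossover index 'cross' with __kmp_pow<UT>(x, cross) <= target, i.e. the
    // first chunk whose geometrically shrinking size would fall below the
    // requested chunk size; parm2 records that crossover point.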
  case kmp_sch_static_greedy:
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
              gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;

  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));

  case kmp_sch_trapezoidal: {
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm2 = (tc / (2 * nproc));

    } else if (parm1 > parm2) {

    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
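    // Trapezoidal scheduling: chunk sizes shrink roughly linearly from the
    // first chunk (parm2, about tc/(2*nproc)) down to the minimum chunk size
    // (parm1), with parm3 the resulting number of chunks and parm4 the
    // per-chunk decrement; the four values are saved in pr->u.p.parm1..parm4
    // for __kmp_dispatch_next_algorithm to consume.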
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected),
                KMP_HNT(GetNewerLibrary),

  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);

template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}

template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}

template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    sh->hier->deallocate();
    __kmp_free(sh->hier);
  }
}
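// __kmp_dispatch_init (below) is the per-thread entry point behind the
// __kmpc_dispatch_init_* wrappers: it picks a dispatch buffer for this loop,
// calls __kmp_dispatch_init_algorithm to fill it, installs the ordered
// enter/exit handlers (th_deo_fcn / th_dxo_fcn), and publishes the private and
// shared descriptors in th_dispatch for the subsequent __kmp_dispatch_next
// calls.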
template <typename T>
static void __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule,
                                T lb, T ub, typename traits_t<T>::signed_t st,
                                typename traits_t<T>::signed_t chunk,
                                int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif

  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                          "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                          traits_t<ST>::spec, traits_t<T>::spec,
                          traits_t<T>::spec, traits_t<ST>::spec);
  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
  __kmp_str_free(&buff);

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  if (schedule == __kmp_static) {

#if KMP_USE_HIER_SCHED
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);

  if (pr->flags.use_hier) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer);

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);

  if (pr->flags.ordered == 0) {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
  } else {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
  }
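  // What follows reuses a small ring of shared dispatch buffers
  // (__kmp_dispatch_num_buffers). A thread that has raced ahead to buffer slot
  // my_buffer_index must wait until sh->buffer_index catches up, i.e. until
  // the team has retired the earlier loop that last used this slot, before the
  // shared state for the new loop may be used.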
  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                 "sh->buffer_index:%d\n",
                 gtid, my_buffer_index, sh->buffer_index));
  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                               __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));

  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                 "sh->buffer_index:%d\n",
                 gtid, my_buffer_index, sh->buffer_index));

  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
  th->th.th_dispatch->th_dispatch_sh_current =
      CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);

  if (pr->flags.ordered) {
    __kmp_itt_ordered_init(gtid);
  }

  if (itt_need_metadata_reporting) {
    kmp_uint64 schedtype = 0;
    case kmp_sch_static_chunked:
    case kmp_sch_static_balanced:
    case kmp_sch_static_greedy:
      cur_chunk = pr->u.p.parm1;
    case kmp_sch_dynamic_chunked:
    case kmp_sch_guided_iterative_chunked:
    case kmp_sch_guided_analytical_chunked:
    case kmp_sch_guided_simd:
    __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
  }
#if KMP_USE_HIER_SCHED
  if (pr->flags.use_hier) {
    pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
  }
#endif // KMP_USE_HIER_SCHED

  buff = __kmp_str_format(
      "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
      " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
      " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
      traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
      traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
      traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
      traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
  __kmp_str_free(&buff);

#if (KMP_STATIC_STEAL_ENABLED)
  if (schedule == kmp_sch_static_steal) {
    volatile T *p = &pr->u.p.static_steal_counter;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(1000,
               ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

      buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                              "ordered_iteration:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                              "ordered_iteration:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT
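// __kmp_dispatch_finish_chunk (below, GOMP compatibility path) is the chunked
// variant of the ordered-exit logic above: instead of bumping
// sh->u.s.ordered_iteration once per iteration it waits for its ordered_lower
// bound and then advances the shared counter by the remaining span of the
// whole chunk (inc = upper - lower + 1, minus whatever was already bumped) in
// a single test_then_add.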
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(1000,
               ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

      buff = __kmp_str_format(
          "__kmp_dispatch_finish_chunk: T#%%d before wait: "
          "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
          traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
      __kmp_str_free(&buff);

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;

      buff = __kmp_str_format(
          "__kmp_dispatch_finish_chunk: T#%%d after wait: "
          "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
          traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
          traits_t<UT>::spec);
      KD_TRACE(1000,
               (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
      __kmp_str_free(&buff);

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);

  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                          "sh:%%p nproc:%%%s tid:%%%s\n",
                          traits_t<T>::spec, traits_t<T>::spec);
  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
  __kmp_str_free(&buff);
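  // Each case of the schedule switch below tries to claim one chunk for the
  // calling thread. On success it leaves a nonzero status with [*p_lb, *p_ub]
  // (and *p_st) describing the chunk and 'last' flagging the final chunk; a
  // zero status means no iterations were left to claim or steal. For
  // kmp_sch_static_steal a thread first drains its own pre-assigned range and
  // then tries to steal the tail of a victim's range, probing victims
  // round-robin starting at pr->u.p.parm4; 4-byte induction types update the
  // packed (count, ub) pair with a 64-bit compare-and-swap, wider types use
  // the per-thread th_steal_lock instead.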
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is ",
              gtid));
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      }

      kmp_info_t **other_threads = team->t.t_threads;
      int while_limit = nproc;
      int while_index = 0;

      while ((!status) && (while_limit != ++while_index)) {
        T victimIdx = pr->u.p.parm4;
        T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
        dispatch_private_info_template<T> *victim =
            reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
        while ((victim == NULL || victim == pr ||
                (*(volatile T *)&victim->u.p.static_steal_counter !=
                 *(volatile T *)&pr->u.p.static_steal_counter)) &&
               oldVictimIdx != victimIdx) {
          victimIdx = (victimIdx + 1) % nproc;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              other_threads[victimIdx]
                  ->th.th_dispatch->th_dispatch_pr_current);
        }
        if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                        *(volatile T *)&pr->u.p.static_steal_counter)) {

        if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
          pr->u.p.parm4 = (victimIdx + 1) % nproc;

        lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
        KMP_ASSERT(lck != NULL);
        __kmp_acquire_lock(lck, gtid);
        limit = victim->u.p.ub;
        if (victim->u.p.count >= limit ||
            (remaining = limit - victim->u.p.count) < 2) {
          __kmp_release_lock(lck, gtid);
          pr->u.p.parm4 = (victimIdx + 1) % nproc;

        if (remaining > 3) {
          KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
          init = (victim->u.p.ub -= (remaining >> 2));
        } else {
          KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
          init = (victim->u.p.ub -= 1);
        }
        __kmp_release_lock(lck, gtid);

        KMP_DEBUG_ASSERT(init + 1 <= limit);
        pr->u.p.parm4 = victimIdx;

        __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
        pr->u.p.count = init + 1;
        __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
      union_i4 vold, vnew;
      vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);

      while (!KMP_COMPARE_AND_STORE_ACQ64(
          (volatile kmp_int64 *)&pr->u.p.count,
          *VOLATILE_CAST(kmp_int64 *) & vold.b,
          *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);

      init = vnew.p.count;
      status = (init < (UT)vnew.p.ub);

      kmp_info_t **other_threads = team->t.t_threads;
      int while_limit = nproc;
      int while_index = 0;

      while ((!status) && (while_limit != ++while_index)) {
        union_i4 vold, vnew;
        kmp_int32 remaining;
        T victimIdx = pr->u.p.parm4;
        T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
        dispatch_private_info_template<T> *victim =
            reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
        while ((victim == NULL || victim == pr ||
                (*(volatile T *)&victim->u.p.static_steal_counter !=
                 *(volatile T *)&pr->u.p.static_steal_counter)) &&
               oldVictimIdx != victimIdx) {
          victimIdx = (victimIdx + 1) % nproc;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              other_threads[victimIdx]
                  ->th.th_dispatch->th_dispatch_pr_current);
        }
        if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                        *(volatile T *)&pr->u.p.static_steal_counter)) {

        pr->u.p.parm4 = victimIdx;

        vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);

        KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
        if (vnew.p.count >= (UT)vnew.p.ub ||
            (remaining = vnew.p.ub - vnew.p.count) < 2) {
          pr->u.p.parm4 = (victimIdx + 1) % nproc;

        if (remaining > 3) {
          vnew.p.ub -= (remaining >> 2);

        KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);

        if (KMP_COMPARE_AND_STORE_ACQ64(
                (volatile kmp_int64 *)&victim->u.p.count,
                *VOLATILE_CAST(kmp_int64 *) & vold.b,
                *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                    vold.p.ub - vnew.p.ub);

          vold.p.count = init + 1;
          KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
          *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;

      start = pr->u.p.parm2;
      limit = chunk + init - 1;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)

        *p_lb = start + init;
        *p_ub = start + limit;

        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
              gtid));
    if ((status = !pr->u.p.count) != 0) {
      last = pr->u.p.parm1;
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;

  case kmp_sch_static_greedy:
  case kmp_sch_static_chunked: {

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)

      pr->u.p.count += nproc;

        *p_lb = start + init;
        *p_ub = start + limit;

        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;

  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
              gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      limit = chunk + init - 1;

      if ((last = (limit >= trip)) != 0)

        *p_lb = start + init;
        *p_ub = start + limit;

        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
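  // Guided (iterative) scheduling, next case: each thread proposes a chunk of
  // roughly remaining * (the double stored in pr->u.p.parm3), i.e. a fixed
  // fraction of what is left, and claims it with compare_and_swap on
  // sh->u.s.iteration; once few iterations remain (the pr->u.p.parm2 threshold
  // chosen at init time) it degrades to plain dynamic-style claiming via
  // test_then_add of chunkspec-sized pieces.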
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked ",
                   gtid));
      init = sh->u.s.iteration;
      remaining = trip - init;
      if (remaining <= 0) {

        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
        remaining = trip - init;
        if (remaining <= 0) {

        if ((T)remaining > chunkspec) {
          limit = init + chunkspec - 1;

          limit = init + remaining - 1;

          (UT)(remaining * *(double *)&pr->u.p.parm3);
        if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)init, (ST)limit)) {

    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;

  case kmp_sch_guided_simd: {
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
      init = sh->u.s.iteration;
      remaining = trip - init;
      if (remaining <= 0) {
      KMP_DEBUG_ASSERT(init % chunk == 0);
      if ((T)remaining < pr->u.p.parm2) {
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
        remaining = trip - init;
        if (remaining <= 0) {

        if ((T)remaining > chunk) {
          limit = init + chunk - 1;

          limit = init + remaining - 1;

        UT span = remaining * (*(double *)&pr->u.p.parm3);
        UT rem = span % chunk;
          span += chunk - rem;
        limit = init + span;
        if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)init, (ST)limit)) {

    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
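  // The analytical guided case below derives each chunk's bounds from the
  // precomputed geometric model: pr->u.p.parm3 holds the per-chunk shrink
  // factor and pr->u.p.parm2 the crossover chunk index after which fixed
  // chunkspec-sized pieces are handed out. On Windows x86 the x87 precision
  // control word is temporarily forced to 64-bit precision
  // (_control87(_PC_64, _MCW_PC)) around the long double math in
  // __kmp_dispatch_guided_remaining and restored afterwards.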
#endif // OMP_45_ENABLED

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        init = chunkIdx * chunkspec + pr->u.p.count;

        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)

#if KMP_OS_WINDOWS && KMP_ARCH_X86
        oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);

        init = __kmp_dispatch_guided_remaining<T>(
            trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
        KMP_DEBUG_ASSERT(init);

        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);

        KMP_DEBUG_ASSERT(limit <= trip);

#if KMP_OS_WINDOWS && KMP_ARCH_X86
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif

    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;

  case kmp_sch_trapezoidal: {
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;

      if ((last = (limit >= trip)) != 0)

        *p_lb = start + init;
        *p_ub = start + limit;

        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;

    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected),
                KMP_HNT(GetNewerLibrary),

  if (pr->flags.ordered) {
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }

  buff = __kmp_str_format(
      "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
      "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
      traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
  __kmp_str_free(&buff);

  return status;
}
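// The OMPT_LOOP_END and KMP_STATS_LOOP_END helpers defined below are used by
// __kmp_dispatch_next on its exit paths (the expansion sites are not all
// visible in this excerpt): the former reports ompt_scope_end for the
// worksharing loop once status == 0, the latter folds the size of the chunk
// just handed out into the dynamic-loop statistics. __kmp_dispatch_next itself
// handles the serialized-team case inline and otherwise defers to
// __kmp_dispatch_next_algorithm, or to the hierarchical scheduler when
// KMP_USE_HIER_SCHED is enabled.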
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
    } else if (i < 0) {                                                        \
      t = (l - u) / (-i) + 1;                                                  \
    } else {                                                                   \
      t = (u - l) / i + 1;                                                     \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st);
  KD_TRACE(1000,
           ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
            gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer);
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      UT limit, trip, init;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100,
               ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      }
      limit = chunk + init - 1;

      if ((last = (limit >= trip)) != 0) {
        pr->u.p.last_upper = pr->u.p.ub;
      }

        *p_lb = start + init;
        *p_ub = start + limit;

        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;

        buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                "ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                        pr->u.p.ordered_upper));
        __kmp_str_free(&buff);
      }
    }

    pr->u.p.last_upper = *p_ub;

    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
        "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
    __kmp_str_free(&buff);
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
  }

    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);

      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
          traits_t<UT>::spec);
      KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
      __kmp_str_free(&buff);

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          kmp_info_t **other_threads = team->t.t_threads;
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));
      }

      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;

      pr->u.p.last_upper = pr->u.p.ub;

  if (p_last != NULL && status != 0)

  buff = __kmp_str_format(
      "__kmp_dispatch_next: T#%%d normal case: "
      "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
      traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                (p_last ? *p_last : 0), status));
  __kmp_str_free(&buff);
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  return status;
}
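// __kmp_dist_get_bounds (below) narrows a distribute loop's [*plower, *pupper]
// to the sub-range owned by the calling team: the trip count is split across
// nteams either with the balanced scheme (a base chunk plus one extra
// iteration for the first 'extras' teams) or with the greedy equal-chunk
// scheme, depending on __kmp_static, and *plastiter is set for the team that
// receives the final iteration.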
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));

  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                          "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                          traits_t<T>::spec, traits_t<T>::spec,
                          traits_t<ST>::spec, traits_t<T>::spec);
  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
  __kmp_str_free(&buff);

  if (__kmp_env_consistency_check) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask);
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy ||
                     __kmp_static == kmp_sch_static_balanced);
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr;
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);

      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;

        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)

        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
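// The __kmpc_* functions below are the compiler-facing entry points; the _4,
// _4u, _8 and _8u suffixes select the 32/64-bit signed or unsigned induction
// variable type and simply forward to the templated __kmp_dispatch_init,
// __kmp_dist_get_bounds, __kmp_dispatch_next and __kmp_dispatch_finish
// implementations above.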
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                        );
}

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                         );
}

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                        );
}

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                         );
}

void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
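// __kmp_wait_yield_4 (and the _4_ptr variant below) spin until
// pred(*spinner, checker) becomes true, re-reading the location with TCR_4 on
// every pass and yielding when the process is oversubscribed
// (TCR_4(__kmp_nth) > __kmp_avail_proc); the KMP_FSYNC_* markers only annotate
// the spin for ITT tracing and do not affect the synchronization itself.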
kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj) {
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);

  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_yield_4_ptr(void *spinner, kmp_uint32 checker,
                            kmp_uint32 (*pred)(void *, kmp_uint32),
                            void *obj) {
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);

  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}