#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#include "ompt-specific.h"
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);
  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type.
  int monotonicity = SCHEDULE_MONOTONIC; // default to monotonic
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}
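// Illustrative sketch (not part of the upstream source), assuming the usual
// kmp_sch_modifier_* bits and SCHEDULE_* macros from kmp.h:
//
//   enum sched_type s = kmp_sch_dynamic_chunked;   // no modifier
//   __kmp_get_monotonicity(s);                     // -> SCHEDULE_MONOTONIC
//   s = (enum sched_type)(s | kmp_sch_modifier_nonmonotonic);
//   __kmp_get_monotonicity(s);                     // -> SCHEDULE_NONMONOTONIC
//   SCHEDULE_WITHOUT_MODIFIERS(s);                 // back to kmp_sch_dynamic_chunked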
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
                                   kmp_uint64 *cur_chunk,
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;

  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init_algorithm: T#%%d called "
        "pr:%%p lb:%%%s ub:%%%s st:%%%s "
        "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
        traits_t<ST>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;

#if KMP_USE_HIER_SCHED
  bool use_hier = pr->flags.use_hier;
#else
  bool use_hier = false;
#endif
  // Pick up the nonmonotonic/monotonic bits from the scheduling type.
  int monotonicity = __kmp_get_monotonicity(schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
  /* Pick up the nomerge/ordered bits from the scheduling type. */
  pr->flags.nomerge = TRUE;
  pr->flags.nomerge = FALSE;
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  pr->flags.ordered = TRUE;
  pr->flags.ordered = FALSE;
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }
  schedule = __kmp_static;
  if (schedule == kmp_sch_runtime) {
    // Use the scheduling specified by OMP_SCHEDULE (or the default)
    schedule = team->t.t_sched.r_sched_type;
    monotonicity = __kmp_get_monotonicity(schedule, use_hier);
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
    schedule = __kmp_guided;
    schedule = __kmp_static;
    // Use the chunk size specified by OMP_SCHEDULE (or the default)
    chunk = team->t.t_sched.chunk;
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                              "schedule:%%d chunk:%%%s\n",
                              traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
    }
    schedule = __kmp_guided;
    chunk = KMP_DEFAULT_CHUNK;
    schedule = __kmp_auto;
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
          "schedule:%%d chunk:%%%s\n",
          traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical is not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    schedule = team->t.t_sched.r_sched_type;
    monotonicity = __kmp_get_monotonicity(schedule, use_hier);
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
    if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
        schedule == __kmp_static) {
      schedule = kmp_sch_static_balanced_chunked;
    }
    chunk = team->t.t_sched.chunk * chunk;
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
          " chunk:%%%s\n",
          traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
    }
    pr->u.p.parm1 = chunk;
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  tc = (UT)(lb - ub) / (-st) + 1;
  tc = (UT)(ub - lb) / st + 1;
#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
  pr->u.p.last_upper = ub + st;

  if (pr->flags.ordered) {
    pr->ordered_bumped = 0;
    pr->u.p.ordered_lower = 1;
    pr->u.p.ordered_upper = 0;
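  // Illustrative arithmetic (not in the upstream file): for lb = 0, ub = 99,
  // st = 4 the positive-increment branch above gives
  //   tc = (99 - 0) / 4 + 1 = 25 iterations;
  // for lb = 99, ub = 0, st = -4 the negative-increment branch gives
  //   tc = (99 - 0) / 4 + 1 = 25 as well.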
  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
         gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
      // parm3 is the number of steal attempts, parm4 the first victim tid
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc; // remember the neighbour tid
      if (traits_t<T>::type_size > 4) {
        // Use a dynamically allocated per-thread lock; it is freed in
        // __kmp_dispatch_next when the loop is finished.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
    } else {
      /* too few chunks: switch to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      if (pr->u.p.parm1 <= 0)
        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    break;
  }
#endif
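  // Worked example (illustrative, not in the upstream file): tc = 100
  // iterations with chunk = 7 gives ntc = 15 chunks.  With nproc = 4:
  // small_chunk = 3, extras = 3, so the initial ownership in chunk units is
  //   tid 0: [ 0,  4)   tid 1: [ 4,  8)   tid 2: [ 8, 12)   tid 3: [12, 15)
  // i.e. threads 0-2 start with 4 chunks and thread 3 with 3; idle threads
  // later steal from these ranges in __kmp_dispatch_next_algorithm().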
  case kmp_sch_static_balanced: {
    T init, limit;
    T id = tid;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
    pr->u.p.parm1 = FALSE;
    T small_chunk = tc / nproc;
    T extras = tc % nproc;
    init = id * small_chunk + (id < extras ? id : extras);
    limit = init + small_chunk - (id < extras ? 0 : 1);
    pr->u.p.parm1 = (id == nproc - 1);
    pr->u.p.parm1 = TRUE;
    pr->u.p.parm1 = FALSE;
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      *cur_chunk = limit - init + 1;
    pr->u.p.lb = lb + init;
    pr->u.p.ub = lb + limit;
    // calculated upper bound; "ub" is the user-defined upper bound
    T ub_tmp = lb + limit * st;
    pr->u.p.lb = lb + init * st;
    // adjust the upper bound to "ub" if needed, so lastprivate matches exactly
    pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
    pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
  } break;
  case kmp_sch_static_balanced_chunked: {
    // Similar to balanced, but the per-thread chunk is rounded up to a
    // multiple of the simd width (passed in via "chunk").
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
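    // Worked example (illustrative, not in the upstream file): tc = 1000,
    // nproc = 8, chunk (simd width) = 32:
    //   (1000 + 7) / 8 = 125 iterations per thread,
    //   (125 + 31) & ~31 = 128, i.e. rounded up to the next multiple of 32.
    // The bit trick assumes the simd width is a power of two.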
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if ((2L * chunk + 1) * nproc >= tc) {
      /* chunk size too large, switch to dynamic */
      schedule = kmp_sch_dynamic_chunked;
    } else {
      // when remaining iterations drop below parm2, switch to dynamic
      pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
      *(double *)&pr->u.p.parm3 =
          guided_flt_param / nproc; // may occupy parm3 and parm4
    }
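    // Illustrative numbers (not from the upstream file), assuming the usual
    // guided_int_param = 2 and guided_flt_param = 0.5 defaults from
    // kmp_dispatch.h: with nproc = 4 and chunk = 9, parm2 = 2*4*10 = 80, so
    // the loop switches to plain dynamic once fewer than 80 iterations
    // remain, while parm3 = 0.5/4 = 0.125 means each grab takes roughly
    // one eighth of the remaining iterations.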
    /* single-thread team: fall through to static_greedy */
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                   "kmp_sch_static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = tc;
  } break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if ((2L * chunk + 1) * nproc >= tc) {
      /* chunk size too large, switch to dynamic */
      schedule = kmp_sch_dynamic_chunked;
#if KMP_USE_X87CONTROL
    // Save the original FPCW and set precision to 64-bit; Windows* OS on
    // IA-32 architecture defaults to 53-bit.
    unsigned int oldFpcw = _control87(0, 0);
    _control87(_PC_64, _MCW_PC);
    /* value used for comparison in the solver for the cross-over point */
    long double target = ((long double)chunk * 2 + 1) * nproc / tc;
    /* commonly used term: (2 nproc - 1)/(2 nproc) */
    x = (long double)1.0 - (long double)0.5 / nproc;
    // test natural alignment of parm3, which holds a DBL
    ptrdiff_t natural_alignment =
        (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
    KMP_DEBUG_ASSERT(
        (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
    /* save the term in the thread-private dispatch structure */
    *(DBL *)&pr->u.p.parm3 = x;

    /* solve for the crossover point to the nearest integer i for which
       C_i <= chunk, first growing "right" exponentially ... */
    p = __kmp_pow<UT>(x, right);
    } while (p > target && right < (1 << 27));
    /* ... then bisecting between "left" and "right" */
    while (left + 1 < right) {
      mid = (left + right) / 2;
      if (__kmp_pow<UT>(x, mid) > target) {
    /* assert sanity of the computed crossover point */
    KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
               __kmp_pow<UT>(x, cross) <= target);
    /* save the crossover point in the thread-private dispatch structure */
    pr->u.p.parm2 = cross;
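    // Illustrative sketch (not part of the upstream source) of the crossover
    // search above, assuming x = 1 - 1/(2*nproc) and
    // target = (2*chunk + 1) * nproc / tc: the solver finds the smallest i
    // with x^i <= target, i.e. the first chunk index whose guided chunk size
    // would drop to "chunk" or below.  For nproc = 4, chunk = 8, tc = 1000:
    //   x = 0.875, target = (17 * 4) / 1000 = 0.068,
    //   0.875^20 ~ 0.069 > target, 0.875^21 ~ 0.061 <= target  =>  cross = 21,
    // so chunk indexes >= 21 use dynamic-style scheduling.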
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
    /* dynamic-style scheduling offset */
    pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                    cross * chunk;
#if KMP_USE_X87CONTROL
    // restore FPCW
    _control87(oldFpcw, _MCW_PC);
#endif
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                   "kmp_sch_static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    pr->u.p.parm1 = tc;
  } break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));
    /* L : size of the last cycle; never larger than the first cycle */
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }
    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    /* sigma : decreasing increment of the trapezoid */
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } break;
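  // Worked example (illustrative, not in the upstream file) for the trapezoid
  // parameters computed above: tc = 1000, nproc = 4, chunk = 10:
  //   parm2 (first chunk)      = 1000 / 8       = 125
  //   parm1 (last chunk)       = min(10, 125)   = 10
  //   parm3 (number of chunks) = (2000+134)/135 = 15
  //   parm4 (decrement)        = (125-10)/14    = 8
  // so successive chunks shrink roughly 125, 117, 109, ... down to ~10.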
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null); // Variadic argument list terminator
  }
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // the type does not matter here, so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif // KMP_USE_HIER_SCHED
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  }

#if KMP_USE_HIER_SCHED
  // Hierarchical scheduling does not work with ordered loops, so if ordered
  // is detected, revert back to threaded scheduling.
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  bool ordered = (kmp_ord_lower & my_sched) != 0;
  if (pr->flags.use_hier && ordered) {
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                   "Disabling hierarchical scheduling.\n",
                   gtid));
    pr->flags.use_hier = FALSE;
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use the runtime hierarchy for ordered loops or if a hierarchy
    // was already specified in the program.
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, &cur_chunk,
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
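  // Illustrative note (not from the upstream file), assuming the default of
  // seven dispatch buffers: loop instance k uses buffer slot
  // k % __kmp_dispatch_num_buffers, so e.g. loop #7 reuses slot 0 and the
  // __kmp_wait below blocks until every thread has finished loop #0 and
  // sh->buffer_index has been advanced past it.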
  if (pr->flags.ordered == 0) {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
  } else {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
  }

  /* the buffer is free to use once sh->buffer_index reaches my_buffer_index */
  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                 "sh->buffer_index:%d\n",
                 gtid, my_buffer_index, sh->buffer_index));
  __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                         __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                 "sh->buffer_index:%d\n",
                 gtid, my_buffer_index, sh->buffer_index));

  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
  th->th.th_dispatch->th_dispatch_sh_current =
      CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);

  if (pr->flags.ordered) {
    __kmp_itt_ordered_init(gtid);
  }
  // Report loop metadata (only the master of an active level-1 team does this)
  if (itt_need_metadata_reporting) {
    kmp_uint64 schedtype = 0;
    switch (schedule) {
    case kmp_sch_static_chunked:
    case kmp_sch_static_balanced: // chunk was calculated above
      break;
    case kmp_sch_static_greedy:
      cur_chunk = pr->u.p.parm1;
      break;
    case kmp_sch_dynamic_chunked:
      schedtype = 1;
      break;
    case kmp_sch_guided_iterative_chunked:
    case kmp_sch_guided_analytical_chunked:
      schedtype = 2;
      break;
    default:
      schedtype = 3;
      break;
    }
    __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
  }
#if KMP_USE_HIER_SCHED
  if (pr->flags.use_hier) {
    pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
  }
#endif // KMP_USE_HIER_SCHED
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#if (KMP_STATIC_STEAL_ENABLED)
  if (schedule == kmp_sch_static_steal) {
    // Other threads inspect this counter when searching for a victim; bumping
    // it marks this thread's chunks as stealable for the new loop instance.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use a lock for 8-byte induction variables and CAS for 4-byte ones
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        // TODO: the search for a victim could be improved
        while ((!status) && (while_limit != ++while_index)) {
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, go to next victim
          }
          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // steal 1/4 of the victim's undone chunks, or 1 chunk if few remain
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // update own count and ub with the stolen range
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
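      // Worked example (illustrative, not in the upstream file): if a victim
      // still owns chunks [count, ub) = [10, 30), then remaining = 20 and the
      // thief takes the top quarter, remaining >> 2 = 5 chunks: the victim's
      // ub drops to 25, the thief's first chunk is init = 25 and its own
      // range becomes count = 26 .. ub = 30, i.e. chunks 25..29 in total,
      // while the victim keeps chunks 10..24.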
    } else {
      // 4-byte induction variable: lock-free CAS protocol on the packed
      // (count, ub) pair.
      union_i4 vold, vnew;
      vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
      vnew = vold;
      vnew.p.count++;
      while (!KMP_COMPARE_AND_STORE_ACQ64(
          (volatile kmp_int64 *)&pr->u.p.count,
          *VOLATILE_CAST(kmp_int64 *) & vold.b,
          *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
        KMP_CPU_PAUSE();
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
      }
      vnew = vold;
      init = vnew.p.count;
      status = (init < (UT)vnew.p.ub);

      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;

        // TODO: the search for a victim could be improved
        while ((!status) && (while_limit != ++while_index)) {
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop while the victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, go to next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // update own count and ub with the stolen range
              init = vnew.p.ub;
              vold.p.count = init + 1;
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
              break;
            }
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
          } // while (try to steal from a particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
    break;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check whether the thread has any iterations left to do */
    if ((status = !pr->u.p.count) != 0) {
      last = pr->u.p.parm1;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } break;
  case kmp_sch_static_greedy: /* original kmp_sch_static_greedy code merged */
  case kmp_sch_static_chunked: {
    T parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;
    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
    break;
  }
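  // Worked example (illustrative, not in the upstream file): with parm1 = 10,
  // nproc = 4 and tid = 2, the first call computes
  //   init = 10 * (0 + 2) = 20, i.e. iterations 20..29;
  // after count += nproc the next call gives init = 10 * (4 + 2) = 60,
  // i.e. iterations 60..69, and so on in a round-robin of fixed-size chunks.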
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;
    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));
    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;
      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
    break;
  }
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    // Start atomic part of calculations
    init = sh->u.s.iteration; // shared value
    remaining = trip - init;
    if (remaining <= 0) { // need to compare with 0 first
      status = 0; // nothing to do, don't try atomic op
    }
    // dynamic-style fallback: atomically increment iterations, get old value
    init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                             (ST)chunkspec);
    remaining = trip - init;
    if (remaining <= 0) {
      status = 0; // all iterations were claimed by other threads
    }
    if ((T)remaining > chunkspec) {
      limit = init + chunkspec - 1;
    } else {
      last = 1; // the last chunk
      limit = init + remaining - 1;
    }
    // guided part: grab remaining * parm3 iterations via CAS
    limit = init + (UT)(remaining * *(double *)&pr->u.p.parm3);
    if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                             (ST)init, (ST)limit)) {
      // CAS was successful, chunk obtained
      status = 1;
      --limit;
    }
    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
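    // Worked example (illustrative, not in the upstream file), assuming
    // parm3 = 0.5 / nproc as set up in __kmp_dispatch_init_algorithm(): with
    // nproc = 4 and trip = 1000, successive successful CAS grabs take
    // remaining / 8 iterations, roughly 125, 109, 95, ... so chunk sizes
    // decay geometrically until fewer than parm2 iterations remain and the
    // code above falls back to plain dynamic chunks of size chunkspec.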
  case kmp_sch_guided_simd: {
    // same as iterative, but the chunk is adjusted to a multiple of the given
    // (simd-width) chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    // Start atomic part of calculations
    init = sh->u.s.iteration; // shared value
    remaining = trip - init;
    if (remaining <= 0) { // need to compare with 0 first
      status = 0; // nothing to do, don't try atomic op
    }
    KMP_DEBUG_ASSERT(init % chunk == 0);
    // compare with K*nproc*(chunk+1), K=2 by default
    if ((T)remaining < pr->u.p.parm2) {
      // dynamic-style fallback: atomically increment iterations, get old value
      init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)chunk);
      remaining = trip - init;
      if (remaining <= 0) {
        status = 0; // all iterations were claimed by other threads
      }
      if ((T)remaining > chunk) {
        limit = init + chunk - 1;
      } else {
        last = 1; // the last chunk
        limit = init + remaining - 1;
      }
    }
    // guided part: grab a span rounded up to a multiple of chunk
    UT span = remaining * (*(double *)&pr->u.p.parm3);
    UT rem = span % chunk;
    if (rem) // adjust so that span % chunk == 0
      span += chunk - rem;
    limit = init + span;
    if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                             (ST)init, (ST)limit)) {
      // CAS was successful, chunk obtained
      status = 1;
      --limit;
    }
    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing the original FPCW value on Windows* OS, IA-32 architecture,
       8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    if (chunkIdx >= (UT)pr->u.p.parm2) {
      /* use dynamic-style scheduling */
      init = chunkIdx * chunkspec + pr->u.p.count;
      /* verify init > 0 in case the above calculation overflowed */
      if ((status = (init > 0 && init <= trip)) != 0) {
        limit = init + chunkspec - 1;
        if ((last = (limit >= trip)) != 0)
          limit = trip;
      }
    } else {
      /* use exponential-style scheduling */
#if KMP_USE_X87CONTROL
      /* save original FPCW and set precision to 64-bit; Windows* OS on IA-32
         architecture defaults to 53-bit */
      if (!fpcwSet) {
        oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
        fpcwSet = 0x30000;
      }
#endif
      init = __kmp_dispatch_guided_remaining<T>(
          trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
      KMP_DEBUG_ASSERT(init);
      init = trip - init;
      limit = trip - __kmp_dispatch_guided_remaining<T>(
                         trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
      KMP_ASSERT(init <= limit);
      KMP_DEBUG_ASSERT(limit <= trip);
    }
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary; check fpcwSet first because oldFpcw may be
       uninitialized here */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;
      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
    break;
  }
  default: {
    status = 0; // avoid complaints about uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null); // Variadic argument list terminator
  } break;
  } // switch
  if (p_last)
    *p_last = last;
  if (pr->flags.ordered) {
    char *buff;
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
  return status;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // Note: schedule(runtime) loops land here even when the resolved schedule is
  // static, so they still pay the dynamic-scheduling bookkeeping cost.
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
          pr->u.p.last_upper = pr->u.p.ub;
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
        }
      }
    } else {
      pr->u.p.last_upper = *p_ub;
    }
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;
      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // the loop is complete, so it is safe to destroy the stealing locks
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */
        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));
      }

      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
    if (p_last != NULL && status != 0)
      *p_last = last;
  }

  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal: bounds and increment disagree in direction.
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute the global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper - lower can exceed the limit of the signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // check/correct the bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}
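// Worked example (illustrative, not in the upstream file): 10 teams,
// lower = 0, upper = 102, incr = 1 gives trip_count = 103.  With
// __kmp_static == kmp_sch_static_balanced: chunk = 10, extras = 3, so teams
// 0-2 each get 11 iterations and teams 3-9 get 10; e.g. team 4 ends up with
// lower = 0 + 4*10 + 3 = 43 and upper = 43 + 10 - 1 = 52.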
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
// Predicate routines used by the dispatch wait loops.
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}
kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}
kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}
kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}
kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
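// Illustrative usage sketch (not part of the upstream source): spin until a
// shared flag reaches a given value, with no higher-level sync object.
//
//   volatile kmp_uint32 flag = 0;
//   // ... some other thread eventually stores 1 into flag ...
//   __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL); // returns once flag == 1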
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */