#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying on KNC uses non-globally-ordered (NGO) 512-bit stores.
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
void __kmp_print_structure(void); // Forward declaration
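// ---------------------------- Barrier Algorithms ----------------------------

// Linear barrier, gather phase: each worker simply bumps its own b_arrived
// flag, while the master waits for every worker's flag in turn and folds in
// the optional reduction as it goes.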
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the worker threads
  // have arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    for (i = 1; i < nproc; ++i) {
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
      KA_TRACE(20,
               ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                team->t.t_id, i,
                &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
      // Wait for the worker thread to arrive
      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                       new_state);
      if (cancellable) {
        bool cancelled = flag.wait_cancellable_nosleep(
            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (cancelled)
          return true;
      } else {
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other
      // thread's time to this thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
              "arrived(%p) = %llu\n",
              gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
              new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
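// Linear barrier, release phase: the master (optionally after pushing ICVs to
// each worker's implicit task) bumps every worker's b_go flag one by one;
// each worker waits on its own b_go and resets it for the next barrier.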
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    {
      KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
      if (propagate_icvs) {
        ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
        for (i = 1; i < nproc; ++i) {
          __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                   team, i, FALSE);
          ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                         &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
        ngo_sync();
      }
    }
#endif // KMP_BARRIER_ICV_PUSH
    // Now, release all of the worker threads
    for (i = 1; i < nproc; ++i) {
      // Prefetch next thread's go flag
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
      KA_TRACE(
          20,
          ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
           "go(%p): %u => %u\n",
           gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
           team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
           other_threads[i]->th.th_bar[bt].bb.b_go,
           other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
      ANNOTATE_BARRIER_BEGIN(other_threads[i]);
      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                       other_threads[i]);
      flag.release();
    }
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    if (cancellable) {
      bool cancelled = flag.wait_cancellable_nosleep(
          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      if (cancelled)
        return true;
    } else {
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier, the ITT object cannot be obtained reliably here
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on the previous parallel region
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing the forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;
    // The worker thread may now assume that the team is valid.
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
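// Tree barrier: threads form a tree with a fan-in/fan-out of
// (1 << branch_bits). In the gather phase each parent waits for its
// children's b_arrived flags (reducing as it goes) and then reports its own
// arrival to its parent.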
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298 kmp_team_t *team = this_thr->th.th_team;
299 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300 kmp_info_t **other_threads = team->t.t_threads;
301 kmp_uint32 nproc = this_thr->th.th_team_nproc;
302 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303 kmp_uint32 branch_factor = 1 << branch_bits;
305 kmp_uint32 child_tid;
306 kmp_uint64 new_state;
309 20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310 gtid, team->t.t_id, tid, bt));
311 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
313 #if USE_ITT_BUILD && USE_ITT_NOTIFY
315 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317 __itt_get_timestamp();
322 child_tid = (tid << branch_bits) + 1;
323 if (child_tid < nproc) {
325 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
328 kmp_info_t *child_thr = other_threads[child_tid];
329 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
332 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
334 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
337 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338 "arrived(%p) == %llu\n",
339 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
342 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344 ANNOTATE_BARRIER_END(child_thr);
345 #if USE_ITT_BUILD && USE_ITT_NOTIFY
348 if (__kmp_forkjoin_frames_mode == 2) {
349 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350 child_thr->th.th_bar_min_time);
355 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357 team->t.t_id, child_tid));
358 ANNOTATE_REDUCE_AFTER(reduce);
359 OMPT_REDUCTION_DECL(this_thr, gtid);
360 OMPT_REDUCTION_BEGIN;
361 (*reduce)(this_thr->th.th_local.reduce_data,
362 child_thr->th.th_local.reduce_data);
364 ANNOTATE_REDUCE_BEFORE(reduce);
365 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
    } while (child <= branch_factor && child_tid < nproc);
372 if (!KMP_MASTER_TID(tid)) {
373 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
376 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377 "arrived(%p): %llu => %llu\n",
378 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
386 ANNOTATE_BARRIER_BEGIN(this_thr);
387 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
392 team->t.t_bar[bt].b_arrived = new_state;
394 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395 KA_TRACE(20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396 "arrived(%p) = %llu\n",
397 gtid, team->t.t_id, tid, team->t.t_id,
398 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
401 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402 gtid, team->t.t_id, tid, bt));
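// Tree barrier, release phase: the wake-up travels top-down; each parent bumps
// the b_go flag of each of its children, optionally pushing ICVs first.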
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(tid)) {
    // Worker: wait for the parent thread to release us
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
426 #if USE_ITT_BUILD && USE_ITT_NOTIFY
427 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
430 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
432 __kmp_itt_task_starting(itt_sync_obj);
434 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
437 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438 if (itt_sync_obj != NULL)
440 __kmp_itt_task_finished(itt_sync_obj);
444 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
448 team = __kmp_threads[gtid]->th.th_team;
449 KMP_DEBUG_ASSERT(team != NULL);
450 tid = __kmp_tid_from_gtid(gtid);
452 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
454 (
"__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
458 team = __kmp_threads[gtid]->th.th_team;
459 KMP_DEBUG_ASSERT(team != NULL);
460 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
462 gtid, team->t.t_id, tid, bt));
464 nproc = this_thr->th.th_team_nproc;
465 child_tid = (tid << branch_bits) + 1;
467 if (child_tid < nproc) {
468 kmp_info_t **other_threads = team->t.t_threads;
472 kmp_info_t *child_thr = other_threads[child_tid];
473 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
476 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
478 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
481 #if KMP_BARRIER_ICV_PUSH
483 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484 if (propagate_icvs) {
485 __kmp_init_implicit_task(team->t.t_ident,
486 team->t.t_threads[child_tid], team,
488 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489 &team->t.t_implicit_task_taskdata[0].td_icvs);
492 #endif // KMP_BARRIER_ICV_PUSH
494 (
"__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495 "go(%p): %u => %u\n",
496 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
500 ANNOTATE_BARRIER_BEGIN(child_thr);
501 kmp_flag_64 flag(&child_bar->b_go, child_thr);
    } while (child <= branch_factor && child_tid < nproc);
  }
508 20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509 gtid, team->t.t_id, tid, bt));
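// Hypercube-embedded tree barrier: at each level a thread either signals the
// parent of its group (if its low bits for that level are non-zero) or waits
// for the children of its group, so the gather completes in a logarithmic
// number of rounds.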
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518 kmp_team_t *team = this_thr->th.th_team;
519 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520 kmp_info_t **other_threads = team->t.t_threads;
521 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524 kmp_uint32 branch_factor = 1 << branch_bits;
530 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531 gtid, team->t.t_id, tid, bt));
532 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
534 #if USE_ITT_BUILD && USE_ITT_NOTIFY
536 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538 __itt_get_timestamp();
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64 p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;
549 if (((tid >> level) & (branch_factor - 1)) != 0) {
550 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
553 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
554 "arrived(%p): %llu => %llu\n",
555 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
556 team->t.t_id, parent_tid, &thr_bar->b_arrived,
558 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
563 ANNOTATE_BARRIER_BEGIN(this_thr);
564 p_flag.set_waiter(other_threads[parent_tid]);
570 if (new_state == KMP_BARRIER_UNUSED_STATE)
571 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
572 for (child = 1, child_tid = tid + (1 << level);
573 child < branch_factor && child_tid < num_threads;
574 child++, child_tid += (1 << level)) {
575 kmp_info_t *child_thr = other_threads[child_tid];
576 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
578 kmp_uint32 next_child_tid = child_tid + (1 << level);
580 if (child + 1 < branch_factor && next_child_tid < num_threads)
582 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
585 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
586 "arrived(%p) == %llu\n",
587 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
588 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
590 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
591 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
592 ANNOTATE_BARRIER_END(child_thr);
593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
596 if (__kmp_forkjoin_frames_mode == 2) {
597 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598 child_thr->th.th_bar_min_time);
603 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605 team->t.t_id, child_tid));
606 ANNOTATE_REDUCE_AFTER(reduce);
607 OMPT_REDUCTION_DECL(this_thr, gtid);
608 OMPT_REDUCTION_BEGIN;
609 (*reduce)(this_thr->th.th_local.reduce_data,
610 child_thr->th.th_local.reduce_data);
612 ANNOTATE_REDUCE_BEFORE(reduce);
613 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
618 if (KMP_MASTER_TID(tid)) {
620 if (new_state == KMP_BARRIER_UNUSED_STATE)
621 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
623 team->t.t_bar[bt].b_arrived = new_state;
624 KA_TRACE(20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625 "arrived(%p) = %llu\n",
626 gtid, team->t.t_id, tid, team->t.t_id,
627 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
630 20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631 gtid, team->t.t_id, tid, bt));
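// Hyper barrier release: when KMP_REVERSE_HYPER_BAR is defined (the default),
// children are released in the reverse of the gather order, which spreads the
// wake-ups and tends to perform better than the forward order.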
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;
655 if (KMP_MASTER_TID(tid)) {
656 team = __kmp_threads[gtid]->th.th_team;
657 KMP_DEBUG_ASSERT(team != NULL);
658 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
660 gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662 if (propagate_icvs) {
663 copy_icvs(&thr_bar->th_fixed_icvs,
664 &team->t.t_implicit_task_taskdata[tid].td_icvs);
668 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
671 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673 ANNOTATE_BARRIER_END(this_thr);
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
677 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
679 __kmp_itt_task_starting(itt_sync_obj);
681 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685 if (itt_sync_obj != NULL)
687 __kmp_itt_task_finished(itt_sync_obj);
691 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
695 team = __kmp_threads[gtid]->th.th_team;
696 KMP_DEBUG_ASSERT(team != NULL);
697 tid = __kmp_tid_from_gtid(gtid);
699 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
701 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705 num_threads = this_thr->th.th_team_nproc;
706 other_threads = team->t.t_threads;
#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    // Go in reverse order through the children, highest to lowest.
    // Initial setting of child is conservative here.
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this: this is the level at which the parent
      // would be notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
        kmp_uint32 next_child_tid = child_tid - (1 << level);
        // Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
759 #if KMP_BARRIER_ICV_PUSH
761 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762 #endif // KMP_BARRIER_ICV_PUSH
766 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767 "go(%p): %u => %u\n",
768 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
772 ANNOTATE_BARRIER_BEGIN(child_thr);
773 kmp_flag_64 flag(&child_bar->b_go, child_thr);
778 #if KMP_BARRIER_ICV_PUSH
779 if (propagate_icvs &&
780 !KMP_MASTER_TID(tid)) {
781 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
783 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784 &thr_bar->th_fixed_icvs);
789 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790 gtid, team->t.t_id, tid, bt));
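// Hierarchical (machine-topology) barrier: threads are arranged according to
// the hierarchy recorded in kmp_bstate_t (depth, skip_per_level, leaf_kids).
// The helper below (re-)initializes that per-thread data whenever the team,
// its size, or the thread's tid changes.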
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }
818 if (uninitialized || team_sz_changed || tid_changed) {
819 thr_bar->my_level = thr_bar->depth - 1;
820 thr_bar->parent_tid = -1;
824 while (d < thr_bar->depth) {
827 if (d == thr_bar->depth - 2) {
828 thr_bar->parent_tid = 0;
829 thr_bar->my_level = d;
831 }
else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
834 thr_bar->parent_tid = tid - rem;
835 thr_bar->my_level = d;
841 thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
842 thr_bar->old_tid = tid;
843 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844 thr_bar->team = team;
845 thr_bar->parent_bar =
846 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
848 if (uninitialized || team_changed || tid_changed) {
849 thr_bar->team = team;
850 thr_bar->parent_bar =
851 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854 if (uninitialized || team_sz_changed || tid_changed) {
855 thr_bar->nproc = nproc;
856 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857 if (thr_bar->my_level == 0)
858 thr_bar->leaf_kids = 0;
859 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860 thr_bar->leaf_kids = nproc - tid - 1;
861 thr_bar->leaf_state = 0;
862 for (
int i = 0; i < thr_bar->leaf_kids; ++i)
863 ((
char *)&(thr_bar->leaf_state))[7 - i] = 1;
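// Hierarchical gather: leaf children check in on their parent's b_arrived
// word, while higher levels wait on each child's own b_arrived flag, reducing
// as they go, until the master finally bumps the team's b_arrived value.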
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872 kmp_team_t *team = this_thr->th.th_team;
873 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874 kmp_uint32 nproc = this_thr->th.th_team_nproc;
875 kmp_info_t **other_threads = team->t.t_threads;
876 kmp_uint64 new_state;
878 int level = team->t.t_level;
880 ->th.th_teams_microtask)
881 if (this_thr->th.th_teams_size.nteams > 1)
884 thr_bar->use_oncore_barrier = 1;
886 thr_bar->use_oncore_barrier = 0;
888 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
890 gtid, team->t.t_id, tid, bt));
891 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY
895 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
900 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903 if (thr_bar->my_level) {
906 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908 thr_bar->use_oncore_barrier) {
909 if (thr_bar->leaf_kids) {
911 kmp_uint64 leaf_state =
913 ? thr_bar->b_arrived | thr_bar->leaf_state
914 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
917 gtid, team->t.t_id, tid));
918 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
919 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
921 ANNOTATE_REDUCE_AFTER(reduce);
922 OMPT_REDUCTION_DECL(this_thr, gtid);
923 OMPT_REDUCTION_BEGIN;
924 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
926 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
928 gtid, team->t.t_id, tid,
929 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
931 ANNOTATE_BARRIER_END(other_threads[child_tid]);
932 (*reduce)(this_thr->th.th_local.reduce_data,
933 other_threads[child_tid]->th.th_local.reduce_data);
936 ANNOTATE_REDUCE_BEFORE(reduce);
937 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943 for (kmp_uint32 d = 1; d < thr_bar->my_level;
945 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946 skip = thr_bar->skip_per_level[d];
949 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950 kmp_info_t *child_thr = other_threads[child_tid];
951 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
954 "arrived(%p) == %llu\n",
955 gtid, team->t.t_id, tid,
956 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957 child_tid, &child_bar->b_arrived, new_state));
958 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
959 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960 ANNOTATE_BARRIER_END(child_thr);
962 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
964 gtid, team->t.t_id, tid,
965 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
967 ANNOTATE_REDUCE_AFTER(reduce);
968 (*reduce)(this_thr->th.th_local.reduce_data,
969 child_thr->th.th_local.reduce_data);
970 ANNOTATE_REDUCE_BEFORE(reduce);
971 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
976 for (kmp_uint32 d = 0; d < thr_bar->my_level;
978 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979 skip = thr_bar->skip_per_level[d];
982 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983 kmp_info_t *child_thr = other_threads[child_tid];
984 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
987 "arrived(%p) == %llu\n",
988 gtid, team->t.t_id, tid,
989 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990 child_tid, &child_bar->b_arrived, new_state));
991 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
992 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993 ANNOTATE_BARRIER_END(child_thr);
995 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
997 gtid, team->t.t_id, tid,
998 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1000 ANNOTATE_REDUCE_AFTER(reduce);
1001 (*reduce)(this_thr->th.th_local.reduce_data,
1002 child_thr->th.th_local.reduce_data);
1003 ANNOTATE_REDUCE_BEFORE(reduce);
1004 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1012 if (!KMP_MASTER_TID(tid)) {
1013 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015 gtid, team->t.t_id, tid,
1016 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1022 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023 !thr_bar->use_oncore_barrier) {
1025 ANNOTATE_BARRIER_BEGIN(this_thr);
1026 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1030 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1031 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1032 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1036 team->t.t_bar[bt].b_arrived = new_state;
1037 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1038 "arrived(%p) = %llu\n",
1039 gtid, team->t.t_id, tid, team->t.t_id,
1040 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1044 "barrier type %d\n",
1045 gtid, team->t.t_id, tid, bt));
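// Hierarchical release: the master wakes its subtree top-down. With infinite
// blocktime and the on-core optimization, ICVs and b_go are pushed together
// using the ngo store helpers; otherwise each child's b_go flag is released
// individually.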
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used
1057 if (KMP_MASTER_TID(tid)) {
1058 team = __kmp_threads[gtid]->th.th_team;
1059 KMP_DEBUG_ASSERT(team != NULL);
1060 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1061 "entered barrier type %d\n",
1062 gtid, team->t.t_id, tid, bt));
1065 if (!thr_bar->use_oncore_barrier ||
1066 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1067 thr_bar->team == NULL) {
1069 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1070 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1071 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1072 ANNOTATE_BARRIER_END(this_thr);
1073 TCW_8(thr_bar->b_go,
1074 KMP_INIT_BARRIER_STATE);
1078 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1079 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1080 thr_bar->offset, bt,
1081 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1082 flag.wait(this_thr, TRUE);
1083 if (thr_bar->wait_flag ==
1084 KMP_BARRIER_SWITCHING) {
1085 TCW_8(thr_bar->b_go,
1086 KMP_INIT_BARRIER_STATE);
1088 (RCAST(
volatile char *,
1089 &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1092 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1094 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1097 team = __kmp_threads[gtid]->th.th_team;
1098 KMP_DEBUG_ASSERT(team != NULL);
1099 tid = __kmp_tid_from_gtid(gtid);
1103 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1104 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1108 nproc = this_thr->th.th_team_nproc;
1109 int level = team->t.t_level;
1110 if (team->t.t_threads[0]
1111 ->th.th_teams_microtask) {
1112 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1113 this_thr->th.th_teams_level == level)
1115 if (this_thr->th.th_teams_size.nteams > 1)
1119 thr_bar->use_oncore_barrier = 1;
1121 thr_bar->use_oncore_barrier = 0;
1125 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1126 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1127 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1133 #if KMP_BARRIER_ICV_PUSH
1134 if (propagate_icvs) {
1135 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1139 copy_icvs(&thr_bar->th_fixed_icvs,
1140 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1141 }
else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1142 thr_bar->use_oncore_barrier) {
1143 if (!thr_bar->my_level)
1146 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1147 &thr_bar->parent_bar->th_fixed_icvs);
1150 if (thr_bar->my_level)
1152 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1154 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1155 &thr_bar->parent_bar->th_fixed_icvs);
1158 #endif // KMP_BARRIER_ICV_PUSH
1161 if (thr_bar->my_level) {
1162 kmp_int32 child_tid;
1164 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1165 thr_bar->use_oncore_barrier) {
1166 if (KMP_MASTER_TID(tid)) {
1169 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1172 ngo_load(&thr_bar->th_fixed_icvs);
1175 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (
int)nproc;
1176 child_tid += thr_bar->skip_per_level[1]) {
1177 kmp_bstate_t *child_bar =
1178 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1179 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1180 "releasing T#%d(%d:%d)"
1181 " go(%p): %u => %u\n",
1182 gtid, team->t.t_id, tid,
1183 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1184 child_tid, &child_bar->b_go, child_bar->b_go,
1185 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1188 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1192 TCW_8(thr_bar->b_go,
1193 KMP_INIT_BARRIER_STATE);
1195 if (thr_bar->leaf_kids) {
1198 old_leaf_kids < thr_bar->leaf_kids) {
1199 if (old_leaf_kids) {
1200 thr_bar->b_go |= old_leaf_state;
1203 last = tid + thr_bar->skip_per_level[1];
1206 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1208 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1209 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1212 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1213 " T#%d(%d:%d) go(%p): %u => %u\n",
1214 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1215 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1216 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1218 ANNOTATE_BARRIER_BEGIN(child_thr);
1219 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1224 thr_bar->b_go |= thr_bar->leaf_state;
1228 for (
int d = thr_bar->my_level - 1; d >= 0;
1230 last = tid + thr_bar->skip_per_level[d + 1];
1231 kmp_uint32 skip = thr_bar->skip_per_level[d];
1234 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1235 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1236 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1237 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1238 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1239 gtid, team->t.t_id, tid,
1240 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1241 child_tid, &child_bar->b_go, child_bar->b_go,
1242 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1244 ANNOTATE_BARRIER_BEGIN(child_thr);
1245 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1250 #if KMP_BARRIER_ICV_PUSH
1251 if (propagate_icvs && !KMP_MASTER_TID(tid))
1253 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1254 &thr_bar->th_fixed_icvs);
1255 #endif // KMP_BARRIER_ICV_PUSH
1257 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1258 "barrier type %d\n",
1259 gtid, team->t.t_id, tid, bt));
// Type trait for the cancellable flag:
// if cancellable is true, is_cancellable is a normal boolean variable;
// if cancellable is false, is_cancellable is a compile-time constant false.
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
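// __kmp_barrier_template is the common implementation behind the external
// barrier entry points below. When instantiated with cancellable=true it
// reports (via is_cancellable<true>) whether the barrier was cancelled;
// otherwise the flag folds away at compile time and the plain status is
// returned.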
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif
1309 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1310 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1312 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1314 if (ompt_enabled.enabled) {
1316 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1317 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1318 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1319 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1320 if (ompt_enabled.ompt_callback_sync_region) {
1321 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1322 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325 if (ompt_enabled.ompt_callback_sync_region_wait) {
1326 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1327 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1334 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1338 if (!team->t.t_serialized) {
1341 void *itt_sync_obj = NULL;
1343 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1344 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 if (__kmp_tasking_mode == tskm_extra_barrier) {
1348 __kmp_tasking_barrier(team, this_thr, gtid);
1350 (
"__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1351 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1358 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1360 this_thr->th.th_team_bt_intervals =
1361 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1362 this_thr->th.th_team_bt_set =
1363 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1365 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1370 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1371 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1375 if (KMP_MASTER_TID(tid)) {
1376 team->t.t_bar[bt].b_master_arrived += 1;
1378 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381 if (reduce != NULL) {
1383 this_thr->th.th_local.reduce_data = reduce_data;
1386 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1388 __kmp_task_team_setup(this_thr, team, 0);
1391 cancelled = __kmp_linear_barrier_gather_cancellable(
1392 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1394 switch (__kmp_barrier_gather_pattern[bt]) {
1395 case bp_hyper_bar: {
1397 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1398 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1399 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402 case bp_hierarchical_bar: {
1403 __kmp_hierarchical_barrier_gather(
1404 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1409 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1410 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1411 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1415 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1416 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1423 if (KMP_MASTER_TID(tid)) {
1425 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1426 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1431 team->t.t_bar[bt].b_team_arrived += 1;
1434 if (__kmp_omp_cancellation) {
1435 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1437 if (cancel_request == cancel_loop ||
1438 cancel_request == cancel_sections) {
1439 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1447 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1448 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1450 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1452 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1453 __kmp_forkjoin_frames_mode &&
1454 this_thr->th.th_teams_microtask == NULL &&
1455 team->t.t_active_level == 1) {
1456 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1457 kmp_uint64 cur_time = __itt_get_timestamp();
1458 kmp_info_t **other_threads = team->t.t_threads;
1459 int nproc = this_thr->th.th_team_nproc;
1461 switch (__kmp_forkjoin_frames_mode) {
1463 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1465 this_thr->th.th_frame_time = cur_time;
1469 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1473 if (__itt_metadata_add_ptr) {
1475 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1478 this_thr->th.th_bar_arrive_time = 0;
1479 for (i = 1; i < nproc; ++i) {
1480 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1481 other_threads[i]->th.th_bar_arrive_time = 0;
1483 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1485 (kmp_uint64)(reduce != NULL));
1487 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1489 this_thr->th.th_frame_time = cur_time;
1497 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1498 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1501 if ((status == 1 || !is_split) && !cancelled) {
1503 cancelled = __kmp_linear_barrier_release_cancellable(
1504 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1506 switch (__kmp_barrier_release_pattern[bt]) {
1507 case bp_hyper_bar: {
1508 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1509 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1510 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1513 case bp_hierarchical_bar: {
1514 __kmp_hierarchical_barrier_release(
1515 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1519 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1520 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1521 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1525 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1526 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1530 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1531 __kmp_task_team_sync(this_thr, team);
1539 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1540 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1544 if (__kmp_tasking_mode != tskm_immediate_exec) {
1545 if (this_thr->th.th_task_team != NULL) {
1547 void *itt_sync_obj = NULL;
1548 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1549 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1550 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1554 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1556 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1557 __kmp_task_team_setup(this_thr, team, 0);
1560 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1561 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1566 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1567 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1568 __kmp_tid_from_gtid(gtid), status));
1571 if (ompt_enabled.enabled) {
1573 if (ompt_enabled.ompt_callback_sync_region_wait) {
1574 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1575 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1578 if (ompt_enabled.ompt_callback_sync_region) {
1579 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1580 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1584 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1587 ANNOTATE_BARRIER_END(&team->t.t_bar);
1590 return (
int)cancelled;
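// Public wrapper. Compiler-generated code reaches this through entry points
// such as __kmpc_barrier(), e.g. (illustrative only):
//   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
// with reduce_size/reduce_data/reduce non-zero only for reduction barriers.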
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}
#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Master does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1634 if (!team->t.t_serialized) {
1635 if (KMP_MASTER_GTID(gtid)) {
1636 switch (__kmp_barrier_release_pattern[bt]) {
1637 case bp_hyper_bar: {
1638 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1639 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1640 FALSE USE_ITT_BUILD_ARG(NULL));
1643 case bp_hierarchical_bar: {
1644 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1645 FALSE USE_ITT_BUILD_ARG(NULL));
1649 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1650 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1651 FALSE USE_ITT_BUILD_ARG(NULL));
1655 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1656 FALSE USE_ITT_BUILD_ARG(NULL));
1659 if (__kmp_tasking_mode != tskm_immediate_exec) {
1660 __kmp_task_team_sync(this_thr, team);
1664 ANNOTATE_BARRIER_END(&team->t.t_bar);
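// __kmp_join_barrier: implicit barrier at the end of a parallel region. Only
// the gather half happens here; the matching release is performed in
// __kmp_fork_barrier when the team is next launched (or torn down).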
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
  int team_id;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  // Get the object created at fork_barrier
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
1689 team = this_thr->th.th_team;
1690 nproc = this_thr->th.th_team_nproc;
1691 KMP_DEBUG_ASSERT((
int)nproc == team->t.t_nproc);
1692 tid = __kmp_tid_from_gtid(gtid);
1694 team_id = team->t.t_id;
1696 master_thread = this_thr->th.th_team_master;
1698 if (master_thread != team->t.t_threads[0]) {
1699 __kmp_print_structure();
1702 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1706 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1707 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1708 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1709 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1710 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1711 gtid, team_id, tid));
1713 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1715 if (ompt_enabled.enabled) {
1717 ompt_data_t *my_task_data;
1718 ompt_data_t *my_parallel_data;
1719 void *codeptr = NULL;
1720 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1721 if (KMP_MASTER_TID(ds_tid) &&
1722 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1723 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1724 codeptr = team->t.ompt_team_info.master_return_address;
1725 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1726 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1727 if (ompt_enabled.ompt_callback_sync_region) {
1728 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1729 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1730 my_task_data, codeptr);
1732 if (ompt_enabled.ompt_callback_sync_region_wait) {
1733 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1734 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735 my_task_data, codeptr);
1737 if (!KMP_MASTER_TID(ds_tid))
1738 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1740 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1744 if (__kmp_tasking_mode == tskm_extra_barrier) {
1745 __kmp_tasking_barrier(team, this_thr, gtid);
1746 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1750 if (__kmp_tasking_mode != tskm_immediate_exec) {
1751 KA_TRACE(20, (
"__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1752 "%p, th_task_team = %p\n",
1753 __kmp_gtid_from_thread(this_thr), team_id,
1754 team->t.t_task_team[this_thr->th.th_task_state],
1755 this_thr->th.th_task_team));
1756 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1757 team->t.t_task_team[this_thr->th.th_task_state]);
1766 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1768 this_thr->th.th_team_bt_intervals =
1769 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1770 this_thr->th.th_team_bt_set =
1771 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1773 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1778 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1779 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1782 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1783 case bp_hyper_bar: {
1784 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1785 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1786 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1789 case bp_hierarchical_bar: {
1790 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1795 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1796 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1797 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1801 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1810 if (KMP_MASTER_TID(tid)) {
1811 if (__kmp_tasking_mode != tskm_immediate_exec) {
1812 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1814 if (__kmp_display_affinity) {
1815 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1817 #if KMP_STATS_ENABLED
1821 for (
int i = 0; i < team->t.t_nproc; ++i) {
1822 kmp_info_t *team_thread = team->t.t_threads[i];
1823 if (team_thread == this_thr)
1825 team_thread->th.th_stats->setIdleFlag();
1826 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1827 team_thread->th.th_sleep_loc != NULL)
1828 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1829 team_thread->th.th_sleep_loc);
1833 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1834 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1837 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1839 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1840 __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1841 team->t.t_active_level == 1) {
1842 kmp_uint64 cur_time = __itt_get_timestamp();
1843 ident_t *loc = team->t.t_ident;
1844 kmp_info_t **other_threads = team->t.t_threads;
1845 int nproc = this_thr->th.th_team_nproc;
1847 switch (__kmp_forkjoin_frames_mode) {
1849 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1853 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1857 if (__itt_metadata_add_ptr) {
1859 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1862 this_thr->th.th_bar_arrive_time = 0;
1863 for (i = 1; i < nproc; ++i) {
1864 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1865 other_threads[i]->th.th_bar_arrive_time = 0;
1867 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1868 cur_time, delta, 0);
1870 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1872 this_thr->th.th_frame_time = cur_time;
1880 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1881 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1886 if (KMP_MASTER_TID(tid)) {
1889 (
"__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1890 gtid, team_id, tid, nproc));
1897 (
"__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1899 ANNOTATE_BARRIER_END(&team->t.t_bar);
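// __kmp_fork_barrier: implicit barrier at the start of a parallel region.
// Workers wait here to be released by the master; afterwards they sync task
// teams, (optionally) pull ICVs, and apply affinity settings before entering
// the region.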
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));
1919 if (KMP_MASTER_TID(tid)) {
1920 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1921 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1923 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1924 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1929 kmp_info_t **other_threads = team->t.t_threads;
1935 for (i = 1; i < team->t.t_nproc; ++i) {
1937 (
"__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1939 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1940 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1941 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1943 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1944 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1945 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1949 if (__kmp_tasking_mode != tskm_immediate_exec) {
1951 __kmp_task_team_setup(this_thr, team, 0);
1960 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1962 this_thr->th.th_team_bt_intervals =
1963 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1964 this_thr->th.th_team_bt_set =
1965 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1967 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1972 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1973 case bp_hyper_bar: {
1974 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1975 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1976 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1979 case bp_hierarchical_bar: {
1980 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1981 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1985 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1986 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1987 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1991 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1992 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1997 if (ompt_enabled.enabled &&
1998 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1999 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2000 ompt_data_t *task_data = (team)
2001 ? OMPT_CUR_TASK_DATA(this_thr)
2002 : &(this_thr->th.ompt_thread_info.task_data);
2003 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2005 void *codeptr = NULL;
2006 if (KMP_MASTER_TID(ds_tid) &&
2007 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2008 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2009 codeptr = team->t.ompt_team_info.master_return_address;
2010 if (ompt_enabled.ompt_callback_sync_region_wait) {
2011 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2012 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2015 if (ompt_enabled.ompt_callback_sync_region) {
2016 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2017 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2021 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2022 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2023 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit);
2029 if (TCR_4(__kmp_global.g.g_done)) {
2030 this_thr->th.th_task_team = NULL;
2032 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2033 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2034 if (!KMP_MASTER_TID(tid)) {
2035 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2037 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2041 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2049 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2050 KMP_DEBUG_ASSERT(team != NULL);
2051 tid = __kmp_tid_from_gtid(gtid);
2053 #if KMP_BARRIER_ICV_PULL
2061 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2062 if (!KMP_MASTER_TID(tid)) {
2066 (
"__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2067 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2069 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2070 &team->t.t_threads[0]
2071 ->th.th_bar[bs_forkjoin_barrier]
2075 #endif // KMP_BARRIER_ICV_PULL
2077 if (__kmp_tasking_mode != tskm_immediate_exec) {
2078 __kmp_task_team_sync(this_thr, team);
2081 #if KMP_AFFINITY_SUPPORTED
2082 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2083 if (proc_bind == proc_bind_intel) {
2085 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2086 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2088 }
else if (proc_bind != proc_bind_false) {
2089 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2090 KA_TRACE(100, (
"__kmp_fork_barrier: T#%d already in correct place %d\n",
2091 __kmp_gtid_from_thread(this_thr),
2092 this_thr->th.th_current_place));
2094 __kmp_affinity_set_place(gtid);
2097 #endif // KMP_AFFINITY_SUPPORTED
2099 if (__kmp_display_affinity) {
2100 if (team->t.t_display_affinity
2101 #
if KMP_AFFINITY_SUPPORTED
2102 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2106 __kmp_aux_display_affinity(gtid, NULL);
2107 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2108 this_thr->th.th_prev_level = team->t.t_level;
2111 if (!KMP_MASTER_TID(tid))
2112 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2114 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2115 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2116 if (!KMP_MASTER_TID(tid)) {
2118 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2119 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2123 ANNOTATE_BARRIER_END(&team->t.t_bar);
2124 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2125 team->t.t_id, tid));
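// __kmp_setup_icv_copy: stages the new ICVs so that the chosen propagation
// scheme (PULL, PUSH, or the linear fallback) can deliver them to every
// implicit task in the team at the fork barrier.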
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs into the master thread's th_fixed_icvs, where all of the worker
     threads can access them and make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}