1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
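// Note: on KMP_MIC builds with USE_NGO_STORES the ngo_* macros above copy the
// ICV cache line with 512-bit non-globally-ordered stores and fence via a
// locked add; on all other targets they reduce to plain copy_icvs()/KMP_MEMCPY
// of the same cache line, so callers may use them unconditionally.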
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
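// Four gather/release algorithm pairs follow: linear, tree, hyper(cube), and
// hierarchical. __kmp_barrier_template() below selects among them per barrier
// type via the gather/release pattern settings, with the fan-out of the tree
// and hyper variants controlled by __kmp_barrier_gather_branch_bits and
// __kmp_barrier_release_branch_bits.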
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50  kmp_team_t *team = this_thr->th.th_team;
51  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52  kmp_info_t **other_threads = team->t.t_threads;
53 
54  KA_TRACE(
55  20,
56  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57  gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64  __itt_get_timestamp();
65  }
66 #endif
67  // We now perform a linear reduction to signal that all of the threads have
68  // arrived.
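// Each worker merely flags its own b_arrived with the master as the waiter;
// the master (tid 0) then spins on every worker's b_arrived in turn and, when
// a reduce callback is supplied, folds that worker's reduce_data in as it
// goes, so the arrival phase is O(nproc) serial work on the master.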
69  if (!KMP_MASTER_TID(tid)) {
70  KA_TRACE(20,
71  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72  "arrived(%p): %llu => %llu\n",
73  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76  // Mark arrival to master thread
77  /* After performing this write, a worker thread may not assume that the team
78  is valid any more - it could be deallocated by the master thread at any
79  time. */
80  ANNOTATE_BARRIER_BEGIN(this_thr);
81  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82  flag.release();
83  } else {
84  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85  int nproc = this_thr->th.th_team_nproc;
86  int i;
87  // Don't have to worry about sleep bit here or atomic since team setting
88  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90  // Collect all the worker team member threads.
91  for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93  // Prefetch next thread's arrived count
94  if (i + 1 < nproc)
95  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98  "arrived(%p) == %llu\n",
99  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100  team->t.t_id, i,
101  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103  // Wait for worker thread to arrive
104  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105  new_state);
106  if (cancellable) {
107  bool cancelled = flag.wait_cancellable_nosleep(
108  this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109  if (cancelled)
110  return true;
111  } else {
112  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113  }
114  ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116  // Barrier imbalance - write min of the thread time and the other thread
117  // time to the thread.
118  if (__kmp_forkjoin_frames_mode == 2) {
119  this_thr->th.th_bar_min_time = KMP_MIN(
120  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121  }
122 #endif
123  if (reduce) {
124  KA_TRACE(100,
125  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127  team->t.t_id, i));
128  ANNOTATE_REDUCE_AFTER(reduce);
129  OMPT_REDUCTION_DECL(this_thr, gtid);
130  OMPT_REDUCTION_BEGIN;
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  OMPT_REDUCTION_END;
134  ANNOTATE_REDUCE_BEFORE(reduce);
135  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136  }
137  }
138  // Don't have to worry about sleep bit here or atomic since team setting
139  team_bar->b_arrived = new_state;
140  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141  "arrived(%p) = %llu\n",
142  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143  new_state));
144  }
145  KA_TRACE(
146  20,
147  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148  gtid, team->t.t_id, tid, bt));
149  return false;
150 }
151 
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158  kmp_team_t *team;
159 
160  if (KMP_MASTER_TID(tid)) {
161  unsigned int i;
162  kmp_uint32 nproc = this_thr->th.th_team_nproc;
163  kmp_info_t **other_threads;
164 
165  team = __kmp_threads[gtid]->th.th_team;
166  KMP_DEBUG_ASSERT(team != NULL);
167  other_threads = team->t.t_threads;
168 
169  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170  "barrier type %d\n",
171  gtid, team->t.t_id, tid, bt));
172 
173  if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175  {
176  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177  if (propagate_icvs) {
178  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179  for (i = 1; i < nproc; ++i) {
180  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181  team, i, FALSE);
182  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183  &team->t.t_implicit_task_taskdata[0].td_icvs);
184  }
185  ngo_sync();
186  }
187  }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190  // Now, release all of the worker threads
191  for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193  // Prefetch next thread's go flag
194  if (i + 1 < nproc)
195  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197  KA_TRACE(
198  20,
199  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200  "go(%p): %u => %u\n",
201  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go,
204  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207  other_threads[i]);
208  flag.release();
209  }
210  }
211  } else { // Wait for the MASTER thread to release us
212  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215  if (cancellable) {
216  bool cancelled = flag.wait_cancellable_nosleep(
217  this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218  if (cancelled) {
219  return true;
220  }
221  } else {
222  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223  }
224  ANNOTATE_BARRIER_END(this_thr);
225 #if USE_ITT_BUILD && USE_ITT_NOTIFY
226  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228  // disabled)
229  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230  // Cancel wait on previous parallel region...
231  __kmp_itt_task_starting(itt_sync_obj);
232 
233  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234  return false;
235 
236  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237  if (itt_sync_obj != NULL)
238  // Call prepare as early as possible for "new" barrier
239  __kmp_itt_task_finished(itt_sync_obj);
240  } else
241 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242  // Early exit for reaping threads releasing forkjoin barrier
243  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244  return false;
245 // The worker thread may now assume that the team is valid.
246 #ifdef KMP_DEBUG
247  tid = __kmp_tid_from_gtid(gtid);
248  team = __kmp_threads[gtid]->th.th_team;
249 #endif
250  KMP_DEBUG_ASSERT(team != NULL);
251  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252  KA_TRACE(20,
253  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255  KMP_MB(); // Flush all pending memory write invalidates.
256  }
257  KA_TRACE(
258  20,
259  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260  gtid, team->t.t_id, tid, bt));
261  return false;
262 }
263 
264 static void __kmp_linear_barrier_gather(
265  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267  __kmp_linear_barrier_gather_template<false>(
268  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 }
270 
271 static bool __kmp_linear_barrier_gather_cancellable(
272  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274  return __kmp_linear_barrier_gather_template<true>(
275  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 }
277 
278 static void __kmp_linear_barrier_release(
279  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281  __kmp_linear_barrier_release_template<false>(
282  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 }
284 
285 static bool __kmp_linear_barrier_release_cancellable(
286  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288  return __kmp_linear_barrier_release_template<true>(
289  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290 }
291 
292 // Tree barrier
293 static void
294 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295  int tid, void (*reduce)(void *, void *)
296  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298  kmp_team_t *team = this_thr->th.th_team;
299  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300  kmp_info_t **other_threads = team->t.t_threads;
301  kmp_uint32 nproc = this_thr->th.th_team_nproc;
302  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303  kmp_uint32 branch_factor = 1 << branch_bits;
304  kmp_uint32 child;
305  kmp_uint32 child_tid;
306  kmp_uint64 new_state;
307 
308  KA_TRACE(
309  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310  gtid, team->t.t_id, tid, bt));
311  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312 
313 #if USE_ITT_BUILD && USE_ITT_NOTIFY
314  // Barrier imbalance - save arrive time to the thread
315  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317  __itt_get_timestamp();
318  }
319 #endif
320  // Perform tree gather to wait until all threads have arrived; reduce any
321  // required data as we go
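// Child/parent mapping: with branch_factor = 1 << branch_bits, thread tid's
// children are tid*branch_factor+1 .. tid*branch_factor+branch_factor, and
// its parent is (tid-1) >> branch_bits. E.g. for branch_bits == 2, tid 1
// gathers from tids 5..8 and then reports its own arrival to tid 0.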
322  child_tid = (tid << branch_bits) + 1;
323  if (child_tid < nproc) {
324  // Parent threads wait for all their children to arrive
325  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326  child = 1;
327  do {
328  kmp_info_t *child_thr = other_threads[child_tid];
329  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330 #if KMP_CACHE_MANAGE
331  // Prefetch next thread's arrived count
332  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333  KMP_CACHE_PREFETCH(
334  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335 #endif /* KMP_CACHE_MANAGE */
336  KA_TRACE(20,
337  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338  "arrived(%p) == %llu\n",
339  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341  // Wait for child to arrive
342  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344  ANNOTATE_BARRIER_END(child_thr);
345 #if USE_ITT_BUILD && USE_ITT_NOTIFY
346  // Barrier imbalance - write min of the thread time and a child time to
347  // the thread.
348  if (__kmp_forkjoin_frames_mode == 2) {
349  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350  child_thr->th.th_bar_min_time);
351  }
352 #endif
353  if (reduce) {
354  KA_TRACE(100,
355  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357  team->t.t_id, child_tid));
358  ANNOTATE_REDUCE_AFTER(reduce);
359  OMPT_REDUCTION_DECL(this_thr, gtid);
360  OMPT_REDUCTION_BEGIN;
361  (*reduce)(this_thr->th.th_local.reduce_data,
362  child_thr->th.th_local.reduce_data);
363  OMPT_REDUCTION_END;
364  ANNOTATE_REDUCE_BEFORE(reduce);
365  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366  }
367  child++;
368  child_tid++;
369  } while (child <= branch_factor && child_tid < nproc);
370  }
371 
372  if (!KMP_MASTER_TID(tid)) { // Worker threads
373  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374 
375  KA_TRACE(20,
376  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377  "arrived(%p): %llu => %llu\n",
378  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381 
382  // Mark arrival to parent thread
383  /* After performing this write, a worker thread may not assume that the team
384  is valid any more - it could be deallocated by the master thread at any
385  time. */
386  ANNOTATE_BARRIER_BEGIN(this_thr);
387  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388  flag.release();
389  } else {
390  // Need to update the team arrived pointer if we are the master thread
391  if (nproc > 1) // New value was already computed above
392  team->t.t_bar[bt].b_arrived = new_state;
393  else
394  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396  "arrived(%p) = %llu\n",
397  gtid, team->t.t_id, tid, team->t.t_id,
398  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399  }
400  KA_TRACE(20,
401  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402  gtid, team->t.t_id, tid, bt));
403 }
404 
405 static void __kmp_tree_barrier_release(
406  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409  kmp_team_t *team;
410  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411  kmp_uint32 nproc;
412  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413  kmp_uint32 branch_factor = 1 << branch_bits;
414  kmp_uint32 child;
415  kmp_uint32 child_tid;
416 
417  // Perform a tree release for all of the threads that have been gathered
418  if (!KMP_MASTER_TID(
419  tid)) { // Handle fork barrier workers who aren't part of a team yet
420  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422  // Wait for parent thread to release us
423  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425  ANNOTATE_BARRIER_END(this_thr);
426 #if USE_ITT_BUILD && USE_ITT_NOTIFY
427  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428  // In fork barrier where we could not get the object reliably (or
429  // ITTNOTIFY is disabled)
430  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431  // Cancel wait on previous parallel region...
432  __kmp_itt_task_starting(itt_sync_obj);
433 
434  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435  return;
436 
437  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438  if (itt_sync_obj != NULL)
439  // Call prepare as early as possible for "new" barrier
440  __kmp_itt_task_finished(itt_sync_obj);
441  } else
442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443  // Early exit for reaping threads releasing forkjoin barrier
444  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445  return;
446 
447  // The worker thread may now assume that the team is valid.
448  team = __kmp_threads[gtid]->th.th_team;
449  KMP_DEBUG_ASSERT(team != NULL);
450  tid = __kmp_tid_from_gtid(gtid);
451 
452  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453  KA_TRACE(20,
454  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456  KMP_MB(); // Flush all pending memory write invalidates.
457  } else {
458  team = __kmp_threads[gtid]->th.th_team;
459  KMP_DEBUG_ASSERT(team != NULL);
460  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461  "barrier type %d\n",
462  gtid, team->t.t_id, tid, bt));
463  }
464  nproc = this_thr->th.th_team_nproc;
465  child_tid = (tid << branch_bits) + 1;
466 
467  if (child_tid < nproc) {
468  kmp_info_t **other_threads = team->t.t_threads;
469  child = 1;
470  // Parent threads release all their children
471  do {
472  kmp_info_t *child_thr = other_threads[child_tid];
473  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474 #if KMP_CACHE_MANAGE
475  // Prefetch next thread's go count
476  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477  KMP_CACHE_PREFETCH(
478  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479 #endif /* KMP_CACHE_MANAGE */
480 
481 #if KMP_BARRIER_ICV_PUSH
482  {
483  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484  if (propagate_icvs) {
485  __kmp_init_implicit_task(team->t.t_ident,
486  team->t.t_threads[child_tid], team,
487  child_tid, FALSE);
488  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489  &team->t.t_implicit_task_taskdata[0].td_icvs);
490  }
491  }
492 #endif // KMP_BARRIER_ICV_PUSH
493  KA_TRACE(20,
494  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495  "go(%p): %u => %u\n",
496  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499  // Release child from barrier
500  ANNOTATE_BARRIER_BEGIN(child_thr);
501  kmp_flag_64 flag(&child_bar->b_go, child_thr);
502  flag.release();
503  child++;
504  child_tid++;
505  } while (child <= branch_factor && child_tid < nproc);
506  }
507  KA_TRACE(
508  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509  gtid, team->t.t_id, tid, bt));
510 }
511 
512 // Hyper Barrier
513 static void
514 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515  int tid, void (*reduce)(void *, void *)
516  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518  kmp_team_t *team = this_thr->th.th_team;
519  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520  kmp_info_t **other_threads = team->t.t_threads;
521  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524  kmp_uint32 branch_factor = 1 << branch_bits;
525  kmp_uint32 offset;
526  kmp_uint32 level;
527 
528  KA_TRACE(
529  20,
530  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531  gtid, team->t.t_id, tid, bt));
532  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533 
534 #if USE_ITT_BUILD && USE_ITT_NOTIFY
535  // Barrier imbalance - save arrive time to the thread
536  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538  __itt_get_timestamp();
539  }
540 #endif
541  /* Perform a hypercube-embedded tree gather to wait until all of the threads
542  have arrived, and reduce any required data as we go. */
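// At each level, a thread whose low (level + branch_bits) tid bits are nonzero
// signals the parent obtained by clearing those bits and stops; otherwise it
// gathers the children at offsets that are multiples of (1 << level). E.g.
// with branch_factor == 2, tid 5 (0b101) signals tid 4 at level 0, and tid 4
// (0b100) later signals tid 0 at level 2.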
543  kmp_flag_64 p_flag(&thr_bar->b_arrived);
544  for (level = 0, offset = 1; offset < num_threads;
545  level += branch_bits, offset <<= branch_bits) {
546  kmp_uint32 child;
547  kmp_uint32 child_tid;
548 
549  if (((tid >> level) & (branch_factor - 1)) != 0) {
550  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551 
552  KA_TRACE(20,
553  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
554  "arrived(%p): %llu => %llu\n",
555  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
556  team->t.t_id, parent_tid, &thr_bar->b_arrived,
557  thr_bar->b_arrived,
558  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
559  // Mark arrival to parent thread
560  /* After performing this write (in the last iteration of the enclosing for
561  loop), a worker thread may not assume that the team is valid any more
562  - it could be deallocated by the master thread at any time. */
563  ANNOTATE_BARRIER_BEGIN(this_thr);
564  p_flag.set_waiter(other_threads[parent_tid]);
565  p_flag.release();
566  break;
567  }
568 
569  // Parent threads wait for children to arrive
570  if (new_state == KMP_BARRIER_UNUSED_STATE)
571  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
572  for (child = 1, child_tid = tid + (1 << level);
573  child < branch_factor && child_tid < num_threads;
574  child++, child_tid += (1 << level)) {
575  kmp_info_t *child_thr = other_threads[child_tid];
576  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
577 #if KMP_CACHE_MANAGE
578  kmp_uint32 next_child_tid = child_tid + (1 << level);
579  // Prefetch next thread's arrived count
580  if (child + 1 < branch_factor && next_child_tid < num_threads)
581  KMP_CACHE_PREFETCH(
582  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
583 #endif /* KMP_CACHE_MANAGE */
584  KA_TRACE(20,
585  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
586  "arrived(%p) == %llu\n",
587  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
588  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
589  // Wait for child to arrive
590  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
591  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
592  ANNOTATE_BARRIER_END(child_thr);
593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
594  // Barrier imbalance - write min of the thread time and a child time to
595  // the thread.
596  if (__kmp_forkjoin_frames_mode == 2) {
597  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598  child_thr->th.th_bar_min_time);
599  }
600 #endif
601  if (reduce) {
602  KA_TRACE(100,
603  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605  team->t.t_id, child_tid));
606  ANNOTATE_REDUCE_AFTER(reduce);
607  OMPT_REDUCTION_DECL(this_thr, gtid);
608  OMPT_REDUCTION_BEGIN;
609  (*reduce)(this_thr->th.th_local.reduce_data,
610  child_thr->th.th_local.reduce_data);
611  OMPT_REDUCTION_END;
612  ANNOTATE_REDUCE_BEFORE(reduce);
613  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
614  }
615  }
616  }
617 
618  if (KMP_MASTER_TID(tid)) {
619  // Need to update the team arrived pointer if we are the master thread
620  if (new_state == KMP_BARRIER_UNUSED_STATE)
621  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
622  else
623  team->t.t_bar[bt].b_arrived = new_state;
624  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625  "arrived(%p) = %llu\n",
626  gtid, team->t.t_id, tid, team->t.t_id,
627  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
628  }
629  KA_TRACE(
630  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631  gtid, team->t.t_id, tid, bt));
632 }
633 
634 // The reverse versions seem to beat the forward versions overall
635 #define KMP_REVERSE_HYPER_BAR
636 static void __kmp_hyper_barrier_release(
637  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
638  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
639  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
640  kmp_team_t *team;
641  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642  kmp_info_t **other_threads;
643  kmp_uint32 num_threads;
644  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645  kmp_uint32 branch_factor = 1 << branch_bits;
646  kmp_uint32 child;
647  kmp_uint32 child_tid;
648  kmp_uint32 offset;
649  kmp_uint32 level;
650 
651  /* Perform a hypercube-embedded tree release for all of the threads that have
652  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
653  are released in the reverse order of the corresponding gather, otherwise
654  threads are released in the same order. */
655  if (KMP_MASTER_TID(tid)) { // master
656  team = __kmp_threads[gtid]->th.th_team;
657  KMP_DEBUG_ASSERT(team != NULL);
658  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
659  "barrier type %d\n",
660  gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662  if (propagate_icvs) { // master already has ICVs in final destination; copy
663  copy_icvs(&thr_bar->th_fixed_icvs,
664  &team->t.t_implicit_task_taskdata[tid].td_icvs);
665  }
666 #endif
667  } else { // Handle fork barrier workers who aren't part of a team yet
668  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
670  // Wait for parent thread to release us
671  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673  ANNOTATE_BARRIER_END(this_thr);
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
676  // In fork barrier where we could not get the object reliably
677  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
678  // Cancel wait on previous parallel region...
679  __kmp_itt_task_starting(itt_sync_obj);
680 
681  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682  return;
683 
684  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685  if (itt_sync_obj != NULL)
686  // Call prepare as early as possible for "new" barrier
687  __kmp_itt_task_finished(itt_sync_obj);
688  } else
689 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
690  // Early exit for reaping threads releasing forkjoin barrier
691  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
692  return;
693 
694  // The worker thread may now assume that the team is valid.
695  team = __kmp_threads[gtid]->th.th_team;
696  KMP_DEBUG_ASSERT(team != NULL);
697  tid = __kmp_tid_from_gtid(gtid);
698 
699  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
700  KA_TRACE(20,
701  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703  KMP_MB(); // Flush all pending memory write invalidates.
704  }
705  num_threads = this_thr->th.th_team_nproc;
706  other_threads = team->t.t_threads;
707 
708 #ifdef KMP_REVERSE_HYPER_BAR
709  // Count up to correct level for parent
710  for (level = 0, offset = 1;
711  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712  level += branch_bits, offset <<= branch_bits)
713  ;
714 
715  // Now go down from there
716  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717  level -= branch_bits, offset >>= branch_bits)
718 #else
719  // Go down the tree, level by level
720  for (level = 0, offset = 1; offset < num_threads;
721  level += branch_bits, offset <<= branch_bits)
722 #endif // KMP_REVERSE_HYPER_BAR
723  {
724 #ifdef KMP_REVERSE_HYPER_BAR
725  /* Now go in reverse order through the children, highest to lowest.
726  Initial setting of child is conservative here. */
727  child = num_threads >> ((level == 0) ? level : level - 1);
728  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729  child_tid = tid + (child << level);
730  child >= 1; child--, child_tid -= (1 << level))
731 #else
732  if (((tid >> level) & (branch_factor - 1)) != 0)
733  // No need to go lower than this, since this is the level parent would be
734  // notified
735  break;
736  // Iterate through children on this level of the tree
737  for (child = 1, child_tid = tid + (1 << level);
738  child < branch_factor && child_tid < num_threads;
739  child++, child_tid += (1 << level))
740 #endif // KMP_REVERSE_HYPER_BAR
741  {
742  if (child_tid >= num_threads)
743  continue; // Child doesn't exist so keep going
744  else {
745  kmp_info_t *child_thr = other_threads[child_tid];
746  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
747 #if KMP_CACHE_MANAGE
748  kmp_uint32 next_child_tid = child_tid - (1 << level);
749 // Prefetch next thread's go count
750 #ifdef KMP_REVERSE_HYPER_BAR
751  if (child - 1 >= 1 && next_child_tid < num_threads)
752 #else
753  if (child + 1 < branch_factor && next_child_tid < num_threads)
754 #endif // KMP_REVERSE_HYPER_BAR
755  KMP_CACHE_PREFETCH(
756  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
757 #endif /* KMP_CACHE_MANAGE */
758 
759 #if KMP_BARRIER_ICV_PUSH
760  if (propagate_icvs) // push my fixed ICVs to my child
761  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762 #endif // KMP_BARRIER_ICV_PUSH
763 
764  KA_TRACE(
765  20,
766  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767  "go(%p): %u => %u\n",
768  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
771  // Release child from barrier
772  ANNOTATE_BARRIER_BEGIN(child_thr);
773  kmp_flag_64 flag(&child_bar->b_go, child_thr);
774  flag.release();
775  }
776  }
777  }
778 #if KMP_BARRIER_ICV_PUSH
779  if (propagate_icvs &&
780  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
781  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
782  FALSE);
783  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784  &thr_bar->th_fixed_icvs);
785  }
786 #endif
787  KA_TRACE(
788  20,
789  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790  gtid, team->t.t_id, tid, bt));
791 }
792 
793 // Hierarchical Barrier
794 
795 // Initialize thread barrier data
796 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797  Performs the minimum amount of initialization required based on how the team
798  has changed. Returns true if leaf children will require both on-core and
799  traditional wake-up mechanisms. For example, if the team size increases,
800  threads already in the team will respond to on-core wakeup on their parent
801  thread, but threads newly added to the team will only be listening on
802  their local b_go. */
803 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804  kmp_bstate_t *thr_bar,
805  kmp_uint32 nproc, int gtid,
806  int tid, kmp_team_t *team) {
807  // Checks to determine if (re-)initialization is needed
808  bool uninitialized = thr_bar->team == NULL;
809  bool team_changed = team != thr_bar->team;
810  bool team_sz_changed = nproc != thr_bar->nproc;
811  bool tid_changed = tid != thr_bar->old_tid;
812  bool retval = false;
813 
814  if (uninitialized || team_sz_changed) {
815  __kmp_get_hierarchy(nproc, thr_bar);
816  }
817 
818  if (uninitialized || team_sz_changed || tid_changed) {
819  thr_bar->my_level = thr_bar->depth - 1; // default for master
820  thr_bar->parent_tid = -1; // default for master
821  if (!KMP_MASTER_TID(
822  tid)) { // if not master, find parent thread in hierarchy
823  kmp_uint32 d = 0;
824  while (d < thr_bar->depth) { // find parent based on level of thread in
825  // hierarchy, and note level
826  kmp_uint32 rem;
827  if (d == thr_bar->depth - 2) { // reached level right below the master
828  thr_bar->parent_tid = 0;
829  thr_bar->my_level = d;
830  break;
831  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
832  0) { // TODO: can we make this op faster?
833  // thread is not a subtree root at next level, so this is max
834  thr_bar->parent_tid = tid - rem;
835  thr_bar->my_level = d;
836  break;
837  }
838  ++d;
839  }
840  }
841  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
842  thr_bar->old_tid = tid;
843  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844  thr_bar->team = team;
845  thr_bar->parent_bar =
846  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
847  }
848  if (uninitialized || team_changed || tid_changed) {
849  thr_bar->team = team;
850  thr_bar->parent_bar =
851  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
852  retval = true;
853  }
854  if (uninitialized || team_sz_changed || tid_changed) {
855  thr_bar->nproc = nproc;
856  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857  if (thr_bar->my_level == 0)
858  thr_bar->leaf_kids = 0;
859  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860  thr_bar->leaf_kids = nproc - tid - 1;
861  thr_bar->leaf_state = 0;
862  for (int i = 0; i < thr_bar->leaf_kids; ++i)
863  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
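// Leaf children check in by setting their byte of this thread's 64-bit
// b_arrived flag through a kmp_flag_oncore (see the gather code below), so
// leaf_state records which high-order bytes to expect; byte index 7 - i
// matches the per-child offset computed above.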
864  }
865  return retval;
866 }
867 
868 static void __kmp_hierarchical_barrier_gather(
869  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
870  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872  kmp_team_t *team = this_thr->th.th_team;
873  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874  kmp_uint32 nproc = this_thr->th.th_team_nproc;
875  kmp_info_t **other_threads = team->t.t_threads;
876  kmp_uint64 new_state;
877 
878  int level = team->t.t_level;
879  if (other_threads[0]
880  ->th.th_teams_microtask) // are we inside the teams construct?
881  if (this_thr->th.th_teams_size.nteams > 1)
882  ++level; // level was not increased in teams construct for team_of_masters
883  if (level == 1)
884  thr_bar->use_oncore_barrier = 1;
885  else
886  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
887 
888  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
889  "barrier type %d\n",
890  gtid, team->t.t_id, tid, bt));
891  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
892 
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY
894  // Barrier imbalance - save arrive time to the thread
895  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
897  }
898 #endif
899 
900  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
901  team);
902 
903  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
904  kmp_int32 child_tid;
905  new_state =
906  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908  thr_bar->use_oncore_barrier) {
909  if (thr_bar->leaf_kids) {
910  // First, wait for leaf children to check-in on my b_arrived flag
911  kmp_uint64 leaf_state =
912  KMP_MASTER_TID(tid)
913  ? thr_bar->b_arrived | thr_bar->leaf_state
914  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
916  "for leaf kids\n",
917  gtid, team->t.t_id, tid));
918  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
919  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
920  if (reduce) {
921  ANNOTATE_REDUCE_AFTER(reduce);
922  OMPT_REDUCTION_DECL(this_thr, gtid);
923  OMPT_REDUCTION_BEGIN;
924  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
925  ++child_tid) {
926  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
927  "T#%d(%d:%d)\n",
928  gtid, team->t.t_id, tid,
929  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930  child_tid));
931  ANNOTATE_BARRIER_END(other_threads[child_tid]);
932  (*reduce)(this_thr->th.th_local.reduce_data,
933  other_threads[child_tid]->th.th_local.reduce_data);
934  }
935  OMPT_REDUCTION_END;
936  ANNOTATE_REDUCE_BEFORE(reduce);
937  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
938  }
939  // clear leaf_state bits
940  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
941  }
942  // Next, wait for higher level children on each child's b_arrived flag
943  for (kmp_uint32 d = 1; d < thr_bar->my_level;
944  ++d) { // gather lowest level threads first, but skip 0
945  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946  skip = thr_bar->skip_per_level[d];
947  if (last > nproc)
948  last = nproc;
949  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950  kmp_info_t *child_thr = other_threads[child_tid];
951  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
953  "T#%d(%d:%d) "
954  "arrived(%p) == %llu\n",
955  gtid, team->t.t_id, tid,
956  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957  child_tid, &child_bar->b_arrived, new_state));
958  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
959  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960  ANNOTATE_BARRIER_END(child_thr);
961  if (reduce) {
962  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
963  "T#%d(%d:%d)\n",
964  gtid, team->t.t_id, tid,
965  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
966  child_tid));
967  ANNOTATE_REDUCE_AFTER(reduce);
968  (*reduce)(this_thr->th.th_local.reduce_data,
969  child_thr->th.th_local.reduce_data);
970  ANNOTATE_REDUCE_BEFORE(reduce);
971  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
972  }
973  }
974  }
975  } else { // Blocktime is not infinite
976  for (kmp_uint32 d = 0; d < thr_bar->my_level;
977  ++d) { // Gather lowest level threads first
978  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979  skip = thr_bar->skip_per_level[d];
980  if (last > nproc)
981  last = nproc;
982  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983  kmp_info_t *child_thr = other_threads[child_tid];
984  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
986  "T#%d(%d:%d) "
987  "arrived(%p) == %llu\n",
988  gtid, team->t.t_id, tid,
989  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990  child_tid, &child_bar->b_arrived, new_state));
991  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
992  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993  ANNOTATE_BARRIER_END(child_thr);
994  if (reduce) {
995  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
996  "T#%d(%d:%d)\n",
997  gtid, team->t.t_id, tid,
998  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
999  child_tid));
1000  ANNOTATE_REDUCE_AFTER(reduce);
1001  (*reduce)(this_thr->th.th_local.reduce_data,
1002  child_thr->th.th_local.reduce_data);
1003  ANNOTATE_REDUCE_BEFORE(reduce);
1004  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1005  }
1006  }
1007  }
1008  }
1009  }
1010  // All subordinates are gathered; now release parent if not master thread
1011 
1012  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1013  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015  gtid, team->t.t_id, tid,
1016  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1019  /* Mark arrival to parent: After performing this write, a worker thread may
1020  not assume that the team is valid any more - it could be deallocated by
1021  the master thread at any time. */
1022  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1024  // flag; release it
1025  ANNOTATE_BARRIER_BEGIN(this_thr);
1026  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1027  flag.release();
1028  } else {
1029  // Leaf does special release on "offset" bits of parent's b_arrived flag
1030  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1031  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1032  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1033  flag.release();
1034  }
1035  } else { // Master thread needs to update the team's b_arrived value
1036  team->t.t_bar[bt].b_arrived = new_state;
1037  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1038  "arrived(%p) = %llu\n",
1039  gtid, team->t.t_id, tid, team->t.t_id,
1040  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1041  }
1042  // Is the team access below unsafe or just technically invalid?
1043  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1044  "barrier type %d\n",
1045  gtid, team->t.t_id, tid, bt));
1046 }
1047 
1048 static void __kmp_hierarchical_barrier_release(
1049  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1050  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1051  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1052  kmp_team_t *team;
1053  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1054  kmp_uint32 nproc;
1055  bool team_change = false; // indicates on-core barrier shouldn't be used
1056 
1057  if (KMP_MASTER_TID(tid)) {
1058  team = __kmp_threads[gtid]->th.th_team;
1059  KMP_DEBUG_ASSERT(team != NULL);
1060  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1061  "entered barrier type %d\n",
1062  gtid, team->t.t_id, tid, bt));
1063  } else { // Worker threads
1064  // Wait for parent thread to release me
1065  if (!thr_bar->use_oncore_barrier ||
1066  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1067  thr_bar->team == NULL) {
1068  // Use traditional method of waiting on my own b_go flag
1069  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1070  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1071  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1072  ANNOTATE_BARRIER_END(this_thr);
1073  TCW_8(thr_bar->b_go,
1074  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1075  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1076  // infinite, not nested
1077  // Wait on my "offset" bits on parent's b_go flag
1078  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1079  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1080  thr_bar->offset, bt,
1081  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1082  flag.wait(this_thr, TRUE);
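// While waiting on the parent's byte, the runtime may switch this thread to
// waiting on its own b_go (wait_flag becomes KMP_BARRIER_SWITCHING), in which
// case the own flag is reset below; otherwise only this thread's byte in the
// parent's b_go needs to be cleared.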
1083  if (thr_bar->wait_flag ==
1084  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1085  TCW_8(thr_bar->b_go,
1086  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1087  } else { // Reset my bits on parent's b_go flag
1088  (RCAST(volatile char *,
1089  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1090  }
1091  }
1092  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1093  // Early exit for reaping threads releasing forkjoin barrier
1094  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1095  return;
1096  // The worker thread may now assume that the team is valid.
1097  team = __kmp_threads[gtid]->th.th_team;
1098  KMP_DEBUG_ASSERT(team != NULL);
1099  tid = __kmp_tid_from_gtid(gtid);
1100 
1101  KA_TRACE(
1102  20,
1103  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1104  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1105  KMP_MB(); // Flush all pending memory write invalidates.
1106  }
1107 
1108  nproc = this_thr->th.th_team_nproc;
1109  int level = team->t.t_level;
1110  if (team->t.t_threads[0]
1111  ->th.th_teams_microtask) { // are we inside the teams construct?
1112  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1113  this_thr->th.th_teams_level == level)
1114  ++level; // level was not increased in teams construct for team_of_workers
1115  if (this_thr->th.th_teams_size.nteams > 1)
1116  ++level; // level was not increased in teams construct for team_of_masters
1117  }
1118  if (level == 1)
1119  thr_bar->use_oncore_barrier = 1;
1120  else
1121  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1122 
1123  // If the team size has increased, we still communicate with old leaves via
1124  // oncore barrier.
1125  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1126  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1127  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1128  tid, team);
1129  // But if the entire team changes, we won't use oncore barrier at all
1130  if (team_change)
1131  old_leaf_kids = 0;
1132 
1133 #if KMP_BARRIER_ICV_PUSH
1134  if (propagate_icvs) {
1135  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1136  FALSE);
1137  if (KMP_MASTER_TID(
1138  tid)) { // master already has copy in final destination; copy
1139  copy_icvs(&thr_bar->th_fixed_icvs,
1140  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1141  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1142  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1143  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1144  // leaves (on-core children) pull parent's fixed ICVs directly to local
1145  // ICV store
1146  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1147  &thr_bar->parent_bar->th_fixed_icvs);
1148  // non-leaves will get ICVs piggybacked with b_go via NGO store
1149  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1150  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1151  // access
1152  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1153  else // leaves copy parent's fixed ICVs directly to local ICV store
1154  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1155  &thr_bar->parent_bar->th_fixed_icvs);
1156  }
1157  }
1158 #endif // KMP_BARRIER_ICV_PUSH
1159 
1160  // Now, release my children
1161  if (thr_bar->my_level) { // not a leaf
1162  kmp_int32 child_tid;
1163  kmp_uint32 last;
1164  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1165  thr_bar->use_oncore_barrier) {
1166  if (KMP_MASTER_TID(tid)) { // do a flat release
1167  // Set local b_go to bump children via NGO store of the cache line
1168  // containing IVCs and b_go.
1169  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1170  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1171  // the cache line
1172  ngo_load(&thr_bar->th_fixed_icvs);
1173  // This loops over all the threads skipping only the leaf nodes in the
1174  // hierarchy
1175  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1176  child_tid += thr_bar->skip_per_level[1]) {
1177  kmp_bstate_t *child_bar =
1178  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1179  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1180  "releasing T#%d(%d:%d)"
1181  " go(%p): %u => %u\n",
1182  gtid, team->t.t_id, tid,
1183  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1184  child_tid, &child_bar->b_go, child_bar->b_go,
1185  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1186  // Use ngo store (if available) to both store ICVs and release child
1187  // via child's b_go
1188  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1189  }
1190  ngo_sync();
1191  }
1192  TCW_8(thr_bar->b_go,
1193  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1194  // Now, release leaf children
1195  if (thr_bar->leaf_kids) { // if there are any
1196  // We test team_change on the off-chance that the level 1 team changed.
1197  if (team_change ||
1198  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1199  if (old_leaf_kids) { // release old leaf kids
1200  thr_bar->b_go |= old_leaf_state;
1201  }
1202  // Release new leaf kids
1203  last = tid + thr_bar->skip_per_level[1];
1204  if (last > nproc)
1205  last = nproc;
1206  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1207  ++child_tid) { // skip_per_level[0]=1
1208  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1209  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1210  KA_TRACE(
1211  20,
1212  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1213  " T#%d(%d:%d) go(%p): %u => %u\n",
1214  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1215  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1216  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1217  // Release child using child's b_go flag
1218  ANNOTATE_BARRIER_BEGIN(child_thr);
1219  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1220  flag.release();
1221  }
1222  } else { // Release all children at once with leaf_state bits on my own
1223  // b_go flag
1224  thr_bar->b_go |= thr_bar->leaf_state;
1225  }
1226  }
1227  } else { // Blocktime is not infinite; do a simple hierarchical release
1228  for (int d = thr_bar->my_level - 1; d >= 0;
1229  --d) { // Release highest level threads first
1230  last = tid + thr_bar->skip_per_level[d + 1];
1231  kmp_uint32 skip = thr_bar->skip_per_level[d];
1232  if (last > nproc)
1233  last = nproc;
1234  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1235  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1236  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1237  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1238  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1239  gtid, team->t.t_id, tid,
1240  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1241  child_tid, &child_bar->b_go, child_bar->b_go,
1242  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1243  // Release child using child's b_go flag
1244  ANNOTATE_BARRIER_BEGIN(child_thr);
1245  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1246  flag.release();
1247  }
1248  }
1249  }
1250 #if KMP_BARRIER_ICV_PUSH
1251  if (propagate_icvs && !KMP_MASTER_TID(tid))
1252  // non-leaves copy ICVs from fixed ICVs to local dest
1253  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1254  &thr_bar->th_fixed_icvs);
1255 #endif // KMP_BARRIER_ICV_PUSH
1256  }
1257  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1258  "barrier type %d\n",
1259  gtid, team->t.t_id, tid, bt));
1260 }
1261 
1262 // End of Barrier Algorithms
1263 
1264 // type traits for cancellable value
1265 // if cancellable is true, then is_cancellable is a normal boolean variable
1266 // if cancellable is false, then is_cancellable is a compile time constant
1267 template <bool cancellable> struct is_cancellable {};
1268 template <> struct is_cancellable<true> {
1269  bool value;
1270  is_cancellable() : value(false) {}
1271  is_cancellable(bool b) : value(b) {}
1272  is_cancellable &operator=(bool b) {
1273  value = b;
1274  return *this;
1275  }
1276  operator bool() const { return value; }
1277 };
1278 template <> struct is_cancellable<false> {
1279  is_cancellable &operator=(bool b) { return *this; }
1280  constexpr operator bool() const { return false; }
1281 };
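// With cancellable == false, operator bool() is a constexpr false, so the
// checks on "cancelled" in __kmp_barrier_template() compile down to
// constant-false and the non-cancellable instantiation adds no extra state or
// branches.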
1282 
1283 // Internal function to do a barrier.
1284 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1285  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1286  barrier
1287  When cancellable = false,
1288  Returns 0 if master thread, 1 if worker thread.
1289  When cancellable = true
1290  Returns 0 if not cancelled, 1 if cancelled. */
1291 template <bool cancellable = false>
1292 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1293  size_t reduce_size, void *reduce_data,
1294  void (*reduce)(void *, void *)) {
1295  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1296  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1297  int tid = __kmp_tid_from_gtid(gtid);
1298  kmp_info_t *this_thr = __kmp_threads[gtid];
1299  kmp_team_t *team = this_thr->th.th_team;
1300  int status = 0;
1301  is_cancellable<cancellable> cancelled;
1302 #if OMPT_SUPPORT && OMPT_OPTIONAL
1303  ompt_data_t *my_task_data;
1304  ompt_data_t *my_parallel_data;
1305  void *return_address;
1306  ompt_sync_region_t barrier_kind;
1307 #endif
1308 
1309  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1310  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1311 
1312  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1313 #if OMPT_SUPPORT
1314  if (ompt_enabled.enabled) {
1315 #if OMPT_OPTIONAL
1316  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1317  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1318  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1319  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1320  if (ompt_enabled.ompt_callback_sync_region) {
1321  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1322  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1323  return_address);
1324  }
1325  if (ompt_enabled.ompt_callback_sync_region_wait) {
1326  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1327  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1328  return_address);
1329  }
1330 #endif
1331  // It is OK to report the barrier state after the barrier begin callback.
1332  // According to the OMPT specification, a compliant implementation may
1333  // even delay reporting this state until the barrier begins to wait.
1334  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1335  }
1336 #endif
1337 
1338  if (!team->t.t_serialized) {
1339 #if USE_ITT_BUILD
1340  // This value will be used in itt notify events below.
1341  void *itt_sync_obj = NULL;
1342 #if USE_ITT_NOTIFY
1343  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1344  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1345 #endif
1346 #endif /* USE_ITT_BUILD */
1347  if (__kmp_tasking_mode == tskm_extra_barrier) {
1348  __kmp_tasking_barrier(team, this_thr, gtid);
1349  KA_TRACE(15,
1350  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1351  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1352  }
1353 
1354  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1355  access it when the team struct is not guaranteed to exist. */
1356  // See note about the corresponding code in __kmp_join_barrier() being
1357  // performance-critical.
1358  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1359 #if KMP_USE_MONITOR
1360  this_thr->th.th_team_bt_intervals =
1361  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1362  this_thr->th.th_team_bt_set =
1363  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1364 #else
1365  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1366 #endif
1367  }
1368 
1369 #if USE_ITT_BUILD
1370  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1371  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1372 #endif /* USE_ITT_BUILD */
1373 #if USE_DEBUGGER
1374  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1375  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1376  team->t.t_bar[bt].b_master_arrived += 1;
1377  } else {
1378  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1379  } // if
1380 #endif /* USE_DEBUGGER */
1381  if (reduce != NULL) {
1382  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1383  this_thr->th.th_local.reduce_data = reduce_data;
1384  }
1385 
1386  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1387  // Pass 0 to set up the current team's task team only if nthreads > 1
1388  __kmp_task_team_setup(this_thr, team, 0);
1389 
1390  if (cancellable) {
1391  cancelled = __kmp_linear_barrier_gather_cancellable(
1392  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1393  } else {
1394  switch (__kmp_barrier_gather_pattern[bt]) {
1395  case bp_hyper_bar: {
1396  // don't set branch bits to 0; use linear
1397  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1398  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1399  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1400  break;
1401  }
1402  case bp_hierarchical_bar: {
1403  __kmp_hierarchical_barrier_gather(
1404  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1405  break;
1406  }
1407  case bp_tree_bar: {
1408  // don't set branch bits to 0; use linear
1409  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1410  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1411  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1412  break;
1413  }
1414  default: {
1415  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1416  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1417  }
1418  }
1419  }
1420 
1421  KMP_MB();
1422 
1423  if (KMP_MASTER_TID(tid)) {
1424  status = 0;
1425  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1426  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1427  }
1428 #if USE_DEBUGGER
1429  // Let the debugger know: all threads have arrived and are starting to
1430  // leave the barrier.
1431  team->t.t_bar[bt].b_team_arrived += 1;
1432 #endif
1433 
1434  if (__kmp_omp_cancellation) {
1435  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1436  // Reset cancellation flag for worksharing constructs
1437  if (cancel_request == cancel_loop ||
1438  cancel_request == cancel_sections) {
1439  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1440  }
1441  }
1442 #if USE_ITT_BUILD
1443  /* TODO: In case of split reduction barrier, master thread may send
1444  acquired event early, before the final summation into the shared
1445  variable is done (final summation can be a long operation for array
1446  reductions). */
1447  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1448  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1449 #endif /* USE_ITT_BUILD */
1450 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1451  // Barrier - report frame end (only if active_level == 1)
1452  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1453  __kmp_forkjoin_frames_mode &&
1454  this_thr->th.th_teams_microtask == NULL &&
1455  team->t.t_active_level == 1) {
1456  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1457  kmp_uint64 cur_time = __itt_get_timestamp();
1458  kmp_info_t **other_threads = team->t.t_threads;
1459  int nproc = this_thr->th.th_team_nproc;
1460  int i;
1461  switch (__kmp_forkjoin_frames_mode) {
1462  case 1:
1463  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1464  loc, nproc);
1465  this_thr->th.th_frame_time = cur_time;
1466  break;
1467  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1468  // be fixed)
1469  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1470  1, loc, nproc);
1471  break;
1472  case 3:
1473  if (__itt_metadata_add_ptr) {
1474  // Initialize with master's wait time
1475  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1476  // Set arrive time to zero to be able to check it in
1477  // __kmp_invoke_task(); the same is done inside the loop below
1478  this_thr->th.th_bar_arrive_time = 0;
1479  for (i = 1; i < nproc; ++i) {
1480  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1481  other_threads[i]->th.th_bar_arrive_time = 0;
1482  }
1483  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1484  cur_time, delta,
1485  (kmp_uint64)(reduce != NULL));
1486  }
1487  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1488  loc, nproc);
1489  this_thr->th.th_frame_time = cur_time;
1490  break;
1491  }
1492  }
1493 #endif /* USE_ITT_BUILD */
1494  } else {
1495  status = 1;
1496 #if USE_ITT_BUILD
1497  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1498  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1499 #endif /* USE_ITT_BUILD */
1500  }
1501  if ((status == 1 || !is_split) && !cancelled) {
1502  if (cancellable) {
1503  cancelled = __kmp_linear_barrier_release_cancellable(
1504  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1505  } else {
1506  switch (__kmp_barrier_release_pattern[bt]) {
1507  case bp_hyper_bar: {
1508  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1509  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1510  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1511  break;
1512  }
1513  case bp_hierarchical_bar: {
1514  __kmp_hierarchical_barrier_release(
1515  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1516  break;
1517  }
1518  case bp_tree_bar: {
1519  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1520  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1521  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1522  break;
1523  }
1524  default: {
1525  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1526  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1527  }
1528  }
1529  }
1530  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1531  __kmp_task_team_sync(this_thr, team);
1532  }
1533  }
1534 
1535 #if USE_ITT_BUILD
1536  /* GEH: TODO: Move this under if-condition above and also include in
1537  __kmp_end_split_barrier(). This will more accurately represent the actual
1538  release time of the threads for split barriers. */
1539  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1540  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1541 #endif /* USE_ITT_BUILD */
1542  } else { // Team is serialized.
1543  status = 0;
1544  if (__kmp_tasking_mode != tskm_immediate_exec) {
1545  if (this_thr->th.th_task_team != NULL) {
1546 #if USE_ITT_NOTIFY
1547  void *itt_sync_obj = NULL;
1548  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1549  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1550  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1551  }
1552 #endif
1553 
1554  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1555  TRUE);
1556  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1557  __kmp_task_team_setup(this_thr, team, 0);
1558 
1559 #if USE_ITT_BUILD
1560  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1561  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1562 #endif /* USE_ITT_BUILD */
1563  }
1564  }
1565  }
1566  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1567  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1568  __kmp_tid_from_gtid(gtid), status));
1569 
1570 #if OMPT_SUPPORT
1571  if (ompt_enabled.enabled) {
1572 #if OMPT_OPTIONAL
1573  if (ompt_enabled.ompt_callback_sync_region_wait) {
1574  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1575  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1576  return_address);
1577  }
1578  if (ompt_enabled.ompt_callback_sync_region) {
1579  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1580  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581  return_address);
1582  }
1583 #endif
1584  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1585  }
1586 #endif
1587  ANNOTATE_BARRIER_END(&team->t.t_bar);
1588 
1589  if (cancellable)
1590  return (int)cancelled;
1591  return status;
1592 }
1593 
1594 // Returns 0 if master thread, 1 if worker thread.
1595 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1596  size_t reduce_size, void *reduce_data,
1597  void (*reduce)(void *, void *)) {
1598  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1599  reduce);
1600 }
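// Illustrative call sites (a sketch only; the real compiler-facing entry points
// live elsewhere in the runtime, e.g. in kmp_csupport.cpp -- the names
// reduce_size/reduce_data/reduce_func below are placeholders):
//
//   // explicit '#pragma omp barrier': no split, no reduction
//   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
//
//   // blocking reduction: pass a combiner so partial results are folded into
//   // reduce_data during the gather phase
//   __kmp_barrier(bs_reduction_barrier, gtid, FALSE, reduce_size, reduce_data,
//                 reduce_func);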
1601 
1602 #if defined(KMP_GOMP_COMPAT)
1603 // Returns 1 if cancelled, 0 otherwise
1604 int __kmp_barrier_gomp_cancel(int gtid) {
1605  if (__kmp_omp_cancellation) {
1606  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1607  0, NULL, NULL);
1608  if (cancelled) {
1609  int tid = __kmp_tid_from_gtid(gtid);
1610  kmp_info_t *this_thr = __kmp_threads[gtid];
1611  if (KMP_MASTER_TID(tid)) {
1612  // Master does not need to revert anything
1613  } else {
1614  // Workers need to revert their private b_arrived flag
1615  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1616  KMP_BARRIER_STATE_BUMP;
1617  }
1618  }
1619  return cancelled;
1620  }
1621  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1622  return FALSE;
1623 }
1624 #endif
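// Illustrative GOMP-compat flow (an assumption-based sketch; the actual GOMP
// entry point is defined in the GOMP interface layer, not in this file):
//
//   // extern "C" int GOMP_barrier_cancel(void) {
//   //   int gtid = __kmp_entry_gtid();
//   //   return __kmp_barrier_gomp_cancel(gtid); // nonzero if the region was cancelled
//   // }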
1625 
1626 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1627  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1628  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1629  int tid = __kmp_tid_from_gtid(gtid);
1630  kmp_info_t *this_thr = __kmp_threads[gtid];
1631  kmp_team_t *team = this_thr->th.th_team;
1632 
1633  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1634  if (!team->t.t_serialized) {
1635  if (KMP_MASTER_GTID(gtid)) {
1636  switch (__kmp_barrier_release_pattern[bt]) {
1637  case bp_hyper_bar: {
1638  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1639  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1640  FALSE USE_ITT_BUILD_ARG(NULL));
1641  break;
1642  }
1643  case bp_hierarchical_bar: {
1644  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1645  FALSE USE_ITT_BUILD_ARG(NULL));
1646  break;
1647  }
1648  case bp_tree_bar: {
1649  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1650  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1651  FALSE USE_ITT_BUILD_ARG(NULL));
1652  break;
1653  }
1654  default: {
1655  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1656  FALSE USE_ITT_BUILD_ARG(NULL));
1657  }
1658  }
1659  if (__kmp_tasking_mode != tskm_immediate_exec) {
1660  __kmp_task_team_sync(this_thr, team);
1661  } // if
1662  }
1663  }
1664  ANNOTATE_BARRIER_END(&team->t.t_bar);
1665 }
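// Illustrative split-barrier pattern (a simplified sketch, not the runtime's
// actual reduction code): the gather half runs inside __kmp_barrier() with
// is_split = TRUE; workers proceed to their release wait there, while the
// master skips the release, finishes its post-gather work, and then releases
// the team via __kmp_end_split_barrier().
//
//   int status = __kmp_barrier(bs_reduction_barrier, gtid, TRUE /* is_split */,
//                              reduce_size, reduce_data, reduce_func);
//   if (status == 0) {
//     // master: finalize the reduction while the workers are still parked
//     __kmp_end_split_barrier(bs_reduction_barrier, gtid); // let the workers go
//   }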
1666 
1667 void __kmp_join_barrier(int gtid) {
1668  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1669  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1670  kmp_info_t *this_thr = __kmp_threads[gtid];
1671  kmp_team_t *team;
1672  kmp_uint nproc;
1673  kmp_info_t *master_thread;
1674  int tid;
1675 #ifdef KMP_DEBUG
1676  int team_id;
1677 #endif /* KMP_DEBUG */
1678 #if USE_ITT_BUILD
1679  void *itt_sync_obj = NULL;
1680 #if USE_ITT_NOTIFY
1681  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1682  // Get object created at fork_barrier
1683  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1684 #endif
1685 #endif /* USE_ITT_BUILD */
1686  KMP_MB();
1687 
1688  // Get current info
1689  team = this_thr->th.th_team;
1690  nproc = this_thr->th.th_team_nproc;
1691  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1692  tid = __kmp_tid_from_gtid(gtid);
1693 #ifdef KMP_DEBUG
1694  team_id = team->t.t_id;
1695 #endif /* KMP_DEBUG */
1696  master_thread = this_thr->th.th_team_master;
1697 #ifdef KMP_DEBUG
1698  if (master_thread != team->t.t_threads[0]) {
1699  __kmp_print_structure();
1700  }
1701 #endif /* KMP_DEBUG */
1702  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1703  KMP_MB();
1704 
1705  // Verify state
1706  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1707  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1708  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1709  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1710  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1711  gtid, team_id, tid));
1712 
1713  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1714 #if OMPT_SUPPORT
1715  if (ompt_enabled.enabled) {
1716 #if OMPT_OPTIONAL
1717  ompt_data_t *my_task_data;
1718  ompt_data_t *my_parallel_data;
1719  void *codeptr = NULL;
1720  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1721  if (KMP_MASTER_TID(ds_tid) &&
1722  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1723  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1724  codeptr = team->t.ompt_team_info.master_return_address;
1725  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1726  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1727  if (ompt_enabled.ompt_callback_sync_region) {
1728  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1729  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1730  my_task_data, codeptr);
1731  }
1732  if (ompt_enabled.ompt_callback_sync_region_wait) {
1733  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1734  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735  my_task_data, codeptr);
1736  }
1737  if (!KMP_MASTER_TID(ds_tid))
1738  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1739 #endif
1740  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1741  }
1742 #endif
1743 
1744  if (__kmp_tasking_mode == tskm_extra_barrier) {
1745  __kmp_tasking_barrier(team, this_thr, gtid);
1746  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1747  team_id, tid));
1748  }
1749 #ifdef KMP_DEBUG
1750  if (__kmp_tasking_mode != tskm_immediate_exec) {
1751  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1752  "%p, th_task_team = %p\n",
1753  __kmp_gtid_from_thread(this_thr), team_id,
1754  team->t.t_task_team[this_thr->th.th_task_state],
1755  this_thr->th.th_task_team));
1756  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1757  team->t.t_task_team[this_thr->th.th_task_state]);
1758  }
1759 #endif /* KMP_DEBUG */
1760 
1761  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1762  access it when the team struct is not guaranteed to exist. Doing these
1763  loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
1764  we do not perform the copy if blocktime=infinite, since the values are not
1765  used by __kmp_wait_template() in that case. */
1766  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1767 #if KMP_USE_MONITOR
1768  this_thr->th.th_team_bt_intervals =
1769  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1770  this_thr->th.th_team_bt_set =
1771  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1772 #else
1773  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1774 #endif
1775  }
1776 
1777 #if USE_ITT_BUILD
1778  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1779  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1780 #endif /* USE_ITT_BUILD */
1781 
1782  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1783  case bp_hyper_bar: {
1784  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1785  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1786  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1787  break;
1788  }
1789  case bp_hierarchical_bar: {
1790  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792  break;
1793  }
1794  case bp_tree_bar: {
1795  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1796  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1797  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1798  break;
1799  }
1800  default: {
1801  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1803  }
1804  }
1805 
1806  /* From this point on, the team data structure may be deallocated at any time
1807  by the master thread - it is unsafe to reference it in any of the worker
1808  threads. Any per-team data items that need to be referenced before the
1809  end of the barrier should be moved to the kmp_task_team_t structs. */
1810  if (KMP_MASTER_TID(tid)) {
1811  if (__kmp_tasking_mode != tskm_immediate_exec) {
1812  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1813  }
1814  if (__kmp_display_affinity) {
1815  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1816  }
1817 #if KMP_STATS_ENABLED
1818  // Have the master thread flag the workers to indicate they are now waiting
1819  // for the next parallel region. Also wake them up so they switch their
1820  // timers to idle.
1821  for (int i = 0; i < team->t.t_nproc; ++i) {
1822  kmp_info_t *team_thread = team->t.t_threads[i];
1823  if (team_thread == this_thr)
1824  continue;
1825  team_thread->th.th_stats->setIdleFlag();
1826  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1827  team_thread->th.th_sleep_loc != NULL)
1828  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1829  team_thread->th.th_sleep_loc);
1830  }
1831 #endif
1832 #if USE_ITT_BUILD
1833  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1834  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1835 #endif /* USE_ITT_BUILD */
1836 
1837 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1838  // Join barrier - report frame end
1839  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1840  __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1841  team->t.t_active_level == 1) {
1842  kmp_uint64 cur_time = __itt_get_timestamp();
1843  ident_t *loc = team->t.t_ident;
1844  kmp_info_t **other_threads = team->t.t_threads;
1845  int nproc = this_thr->th.th_team_nproc;
1846  int i;
1847  switch (__kmp_forkjoin_frames_mode) {
1848  case 1:
1849  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1850  loc, nproc);
1851  break;
1852  case 2:
1853  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1854  loc, nproc);
1855  break;
1856  case 3:
1857  if (__itt_metadata_add_ptr) {
1858  // Initialize with master's wait time
1859  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1860  // Set arrive time to zero to be able to check it in
1861  // __kmp_invoke_task(); the same is done inside the loop below
1862  this_thr->th.th_bar_arrive_time = 0;
1863  for (i = 1; i < nproc; ++i) {
1864  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1865  other_threads[i]->th.th_bar_arrive_time = 0;
1866  }
1867  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1868  cur_time, delta, 0);
1869  }
1870  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1871  loc, nproc);
1872  this_thr->th.th_frame_time = cur_time;
1873  break;
1874  }
1875  }
1876 #endif /* USE_ITT_BUILD */
1877  }
1878 #if USE_ITT_BUILD
1879  else {
1880  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1881  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1882  }
1883 #endif /* USE_ITT_BUILD */
1884 
1885 #if KMP_DEBUG
1886  if (KMP_MASTER_TID(tid)) {
1887  KA_TRACE(
1888  15,
1889  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1890  gtid, team_id, tid, nproc));
1891  }
1892 #endif /* KMP_DEBUG */
1893 
1894  // TODO: now mark worker threads as done so they may be disbanded
1895  KMP_MB(); // Flush all pending memory write invalidates.
1896  KA_TRACE(10,
1897  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1898 
1899  ANNOTATE_BARRIER_END(&team->t.t_bar);
1900 }
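// Illustrative pairing on the master side (an assumption-based, simplified
// sketch of the join path): the master may only reclaim or resize the team
// after this barrier completes, since workers stop referencing the team struct
// once they pass their gather step above.
//
//   // __kmp_invoke_task_func(gtid);  // run the parallel region's microtask
//   // __kmp_join_barrier(gtid);      // wait until the whole team has arrived
//   // ... only now is it safe for the master to free/resize team resources ...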
1901 
1902 // TODO release worker threads' fork barriers as we are ready instead of all at
1903 // once
1904 void __kmp_fork_barrier(int gtid, int tid) {
1905  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1906  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1907  kmp_info_t *this_thr = __kmp_threads[gtid];
1908  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1909 #if USE_ITT_BUILD
1910  void *itt_sync_obj = NULL;
1911 #endif /* USE_ITT_BUILD */
1912  if (team)
1913  ANNOTATE_BARRIER_END(&team->t.t_bar);
1914 
1915  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1916  (team != NULL) ? team->t.t_id : -1, tid));
1917 
1918  // th_team pointer only valid for master thread here
1919  if (KMP_MASTER_TID(tid)) {
1920 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1921  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1922  // Create itt barrier object
1923  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1924  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1925  }
1926 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1927 
1928 #ifdef KMP_DEBUG
1929  kmp_info_t **other_threads = team->t.t_threads;
1930  int i;
1931 
1932  // Verify state
1933  KMP_MB();
1934 
1935  for (i = 1; i < team->t.t_nproc; ++i) {
1936  KA_TRACE(500,
1937  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1938  "== %u.\n",
1939  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1940  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1941  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1942  KMP_DEBUG_ASSERT(
1943  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1944  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1945  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1946  }
1947 #endif
1948 
1949  if (__kmp_tasking_mode != tskm_immediate_exec) {
1950  // Pass 0 to set up the current task team only if nthreads > 1
1951  __kmp_task_team_setup(this_thr, team, 0);
1952  }
1953 
1954  /* The master thread may have changed its blocktime between the join barrier
1955  and the fork barrier. Copy the blocktime info to the thread, where
1956  __kmp_wait_template() can access it when the team struct is not
1957  guaranteed to exist. */
1958  // See note about the corresponding code in __kmp_join_barrier() being
1959  // performance-critical
1960  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1961 #if KMP_USE_MONITOR
1962  this_thr->th.th_team_bt_intervals =
1963  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1964  this_thr->th.th_team_bt_set =
1965  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1966 #else
1967  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1968 #endif
1969  }
1970  } // master
1971 
1972  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1973  case bp_hyper_bar: {
1974  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1975  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1976  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1977  break;
1978  }
1979  case bp_hierarchical_bar: {
1980  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1981  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1982  break;
1983  }
1984  case bp_tree_bar: {
1985  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1986  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1987  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1988  break;
1989  }
1990  default: {
1991  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1992  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1993  }
1994  }
1995 
1996 #if OMPT_SUPPORT
1997  if (ompt_enabled.enabled &&
1998  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1999  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2000  ompt_data_t *task_data = (team)
2001  ? OMPT_CUR_TASK_DATA(this_thr)
2002  : &(this_thr->th.ompt_thread_info.task_data);
2003  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2004 #if OMPT_OPTIONAL
2005  void *codeptr = NULL;
2006  if (KMP_MASTER_TID(ds_tid) &&
2007  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2008  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2009  codeptr = team->t.ompt_team_info.master_return_address;
2010  if (ompt_enabled.ompt_callback_sync_region_wait) {
2011  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2012  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2013  codeptr);
2014  }
2015  if (ompt_enabled.ompt_callback_sync_region) {
2016  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2017  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2018  codeptr);
2019  }
2020 #endif
2021  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2022  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2023  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2024  }
2025  }
2026 #endif
2027 
2028  // Early exit for reaping threads releasing forkjoin barrier
2029  if (TCR_4(__kmp_global.g.g_done)) {
2030  this_thr->th.th_task_team = NULL;
2031 
2032 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2033  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2034  if (!KMP_MASTER_TID(tid)) {
2035  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2036  if (itt_sync_obj)
2037  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2038  }
2039  }
2040 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2041  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2042  return;
2043  }
2044 
2045  /* We can now assume that a valid team structure has been allocated by the
2046  master and propagated to all worker threads. The current thread, however,
2047  may not be part of the team, so we can't blindly assume that the team
2048  pointer is non-null. */
2049  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2050  KMP_DEBUG_ASSERT(team != NULL);
2051  tid = __kmp_tid_from_gtid(gtid);
2052 
2053 #if KMP_BARRIER_ICV_PULL
2054  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2055  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2056  implicit task has this data before this function is called. We cannot
2057  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2058  struct, because it is not always the case that the threads arrays have
2059  been allocated when __kmp_fork_call() is executed. */
2060  {
2061  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2062  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2063  // Copy the initial ICVs from the master's thread struct to the implicit
2064  // task for this tid.
2065  KA_TRACE(10,
2066  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2067  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2068  tid, FALSE);
2069  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2070  &team->t.t_threads[0]
2071  ->th.th_bar[bs_forkjoin_barrier]
2072  .bb.th_fixed_icvs);
2073  }
2074  }
2075 #endif // KMP_BARRIER_ICV_PULL
2076 
2077  if (__kmp_tasking_mode != tskm_immediate_exec) {
2078  __kmp_task_team_sync(this_thr, team);
2079  }
2080 
2081 #if KMP_AFFINITY_SUPPORTED
2082  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2083  if (proc_bind == proc_bind_intel) {
2084  // Call dynamic affinity settings
2085  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2086  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2087  }
2088  } else if (proc_bind != proc_bind_false) {
2089  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2090  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2091  __kmp_gtid_from_thread(this_thr),
2092  this_thr->th.th_current_place));
2093  } else {
2094  __kmp_affinity_set_place(gtid);
2095  }
2096  }
2097 #endif // KMP_AFFINITY_SUPPORTED
2098  // Perform the display affinity functionality
2099  if (__kmp_display_affinity) {
2100  if (team->t.t_display_affinity
2101 #if KMP_AFFINITY_SUPPORTED
2102  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2103 #endif
2104  ) {
2105  // NULL means use the affinity-format-var ICV
2106  __kmp_aux_display_affinity(gtid, NULL);
2107  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2108  this_thr->th.th_prev_level = team->t.t_level;
2109  }
2110  }
2111  if (!KMP_MASTER_TID(tid))
2112  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2113 
2114 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2115  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2116  if (!KMP_MASTER_TID(tid)) {
2117  // Get correct barrier object
2118  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2119  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2120  } // (prepare called inside barrier_release)
2121  }
2122 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2123  ANNOTATE_BARRIER_END(&team->t.t_bar);
2124  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2125  team->t.t_id, tid));
2126 }
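// Illustrative worker lifecycle (a simplified sketch of how the fork/join
// barriers are typically driven; the real loop lives in the thread-launch
// code, not in this file):
//
//   // while (!TCR_4(__kmp_global.g.g_done)) {
//   //   __kmp_fork_barrier(gtid, tid); // park until the master releases the team
//   //   if (TCR_4(__kmp_global.g.g_done))
//   //     break;                       // runtime is shutting down
//   //   ... run the implicit task for the new parallel region ...
//   //   __kmp_join_barrier(gtid);      // report completion back to the master
//   // }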
2127 
2128 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2129  kmp_internal_control_t *new_icvs, ident_t *loc) {
2130  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2131 
2132  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2133  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2134 
2135 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2136  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2137  implicit task has this data before this function is called. */
2138 #if KMP_BARRIER_ICV_PULL
2139  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
2140  remains untouched across the barrier), where all of the worker threads can
2141  access them and make their own copies after the barrier. */
2142  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2143  // allocated at this point
2144  copy_icvs(
2145  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2146  new_icvs);
2147  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2148  team->t.t_threads[0], team));
2149 #elif KMP_BARRIER_ICV_PUSH
2150  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2151  // done here.
2152  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2153  team->t.t_threads[0], team));
2154 #else
2155  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2156  // time.
2157  ngo_load(new_icvs);
2158  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2159  // allocated at this point
2160  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2161  // TODO: GEH - pass in better source location info since usually NULL here
2162  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2163  f, team->t.t_threads[f], team));
2164  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2165  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2166  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2167  f, team->t.t_threads[f], team));
2168  }
2169  ngo_sync();
2170 #endif // KMP_BARRIER_ICV_PULL
2171 }
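// Summary of the three propagation schemes handled above:
//   KMP_BARRIER_ICV_PULL : the new ICVs are parked in the master's th_fixed_icvs
//                          here; each worker copies them for itself in
//                          __kmp_fork_barrier().
//   KMP_BARRIER_ICV_PUSH : nothing is copied here; the ICVs travel with the
//                          fork-barrier release.
//   neither              : a linear O(nthreads) copy into every non-master
//                          implicit task is performed immediately (the
//                          ngo_store_icvs() loop above).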