LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #include "tsan_annotations.h"
25 
26 /* forward declaration */
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28  kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30  kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32  kmp_task_team_t *task_team);
33 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
34 
35 #ifdef BUILD_TIED_TASK_STACK
36 
37 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
38 // from top to bottom
39 //
40 // gtid: global thread identifier for thread containing stack
41 // thread_data: thread data for task team thread containing stack
42 // threshold: value above which the trace statement triggers
43 // location: string identifying call site of this function (for trace)
44 static void __kmp_trace_task_stack(kmp_int32 gtid,
45  kmp_thread_data_t *thread_data,
46  int threshold, char *location) {
47  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
48  kmp_taskdata_t **stack_top = task_stack->ts_top;
49  kmp_int32 entries = task_stack->ts_entries;
50  kmp_taskdata_t *tied_task;
51 
52  KA_TRACE(
53  threshold,
54  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
55  "first_block = %p, stack_top = %p \n",
56  location, gtid, entries, task_stack->ts_first_block, stack_top));
57 
58  KMP_DEBUG_ASSERT(stack_top != NULL);
59  KMP_DEBUG_ASSERT(entries > 0);
60 
61  while (entries != 0) {
62  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
63  // fix up ts_top if we need to pop from previous block
64  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
65  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
66 
67  stack_block = stack_block->sb_prev;
68  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
69  }
70 
71  // finish bookkeeping
72  stack_top--;
73  entries--;
74 
75  tied_task = *stack_top;
76 
77  KMP_DEBUG_ASSERT(tied_task != NULL);
78  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
79 
80  KA_TRACE(threshold,
81  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
82  "stack_top=%p, tied_task=%p\n",
83  location, gtid, entries, stack_top, tied_task));
84  }
85  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
86 
87  KA_TRACE(threshold,
88  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
89  location, gtid));
90 }
91 
92 // __kmp_init_task_stack: initialize the task stack for the first time
93 // after a thread_data structure is created.
94 // It should not be necessary to do this again (assuming the stack works).
95 //
96 // gtid: global thread identifier of calling thread
97 // thread_data: thread data for task team thread containing stack
98 static void __kmp_init_task_stack(kmp_int32 gtid,
99  kmp_thread_data_t *thread_data) {
100  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
101  kmp_stack_block_t *first_block;
102 
103  // set up the first block of the stack
104  first_block = &task_stack->ts_first_block;
105  task_stack->ts_top = (kmp_taskdata_t **)first_block;
106  memset((void *)first_block, '\0',
107  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
108 
109  // initialize the stack to be empty
110  task_stack->ts_entries = TASK_STACK_EMPTY;
111  first_block->sb_next = NULL;
112  first_block->sb_prev = NULL;
113 }
114 
115 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
116 //
117 // gtid: global thread identifier for calling thread
118 // thread_data: thread info for thread containing stack
119 static void __kmp_free_task_stack(kmp_int32 gtid,
120  kmp_thread_data_t *thread_data) {
121  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
122  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
123 
124  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
125  // free from the second block of the stack
126  while (stack_block != NULL) {
127  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
128 
129  stack_block->sb_next = NULL;
130  stack_block->sb_prev = NULL;
131  if (stack_block != &task_stack->ts_first_block) {
132  __kmp_thread_free(__kmp_threads[gtid],
133  stack_block); // free the block, if not the first
134  }
135  stack_block = next_block;
136  }
137  // initialize the stack to be empty
138  task_stack->ts_entries = 0;
139  task_stack->ts_top = NULL;
140 }
141 
142 // __kmp_push_task_stack: Push the tied task onto the task stack.
143 // Grow the stack if necessary by allocating another block.
144 //
145 // gtid: global thread identifier for calling thread
146 // thread: thread info for thread containing stack
147 // tied_task: the task to push on the stack
148 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
149  kmp_taskdata_t *tied_task) {
150  // GEH - need to consider what to do if tt_threads_data not allocated yet
151  kmp_thread_data_t *thread_data =
152  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
153  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
154 
155  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
156  return; // Don't push anything on stack if team or team tasks are serialized
157  }
158 
159  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
160  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
161 
162  KA_TRACE(20,
163  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
164  gtid, thread, tied_task));
165  // Store entry
166  *(task_stack->ts_top) = tied_task;
167 
168  // Do bookkeeping for next push
169  task_stack->ts_top++;
170  task_stack->ts_entries++;
171 
172  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
173  // Find beginning of this task block
174  kmp_stack_block_t *stack_block =
175  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
176 
177  // Check if we already have a block
178  if (stack_block->sb_next !=
179  NULL) { // reset ts_top to beginning of next block
180  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
181  } else { // Alloc new block and link it up
182  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
183  thread, sizeof(kmp_stack_block_t));
184 
185  task_stack->ts_top = &new_block->sb_block[0];
186  stack_block->sb_next = new_block;
187  new_block->sb_prev = stack_block;
188  new_block->sb_next = NULL;
189 
190  KA_TRACE(
191  30,
192  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
193  gtid, tied_task, new_block));
194  }
195  }
196  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
197  tied_task));
198 }
199 
200 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
201 // the task, just check to make sure it matches the ending task passed in.
202 //
203 // gtid: global thread identifier for the calling thread
204 // thread: thread info structure containing stack
205 // tied_task: the task popped off the stack
206 // ending_task: the task that is ending (should match popped task)
207 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
208  kmp_taskdata_t *ending_task) {
209  // GEH - need to consider what to do if tt_threads_data not allocated yet
210  kmp_thread_data_t *thread_data =
211  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
212  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
213  kmp_taskdata_t *tied_task;
214 
215  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
216  // Don't pop anything from stack if team or team tasks are serialized
217  return;
218  }
219 
220  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
221  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
222 
223  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
224  thread));
225 
226  // fix up ts_top if we need to pop from previous block
227  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
228  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
229 
230  stack_block = stack_block->sb_prev;
231  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
232  }
233 
234  // finish bookkeeping
235  task_stack->ts_top--;
236  task_stack->ts_entries--;
237 
238  tied_task = *(task_stack->ts_top);
239 
240  KMP_DEBUG_ASSERT(tied_task != NULL);
241  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
242  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
243 
244  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
245  tied_task));
246  return;
247 }
248 #endif /* BUILD_TIED_TASK_STACK */
249 
250 // returns 1 if new task is allowed to execute, 0 otherwise
251 // checks Task Scheduling constraint (if requested) and
252 // mutexinoutset dependencies if any
253 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
254  const kmp_taskdata_t *tasknew,
255  const kmp_taskdata_t *taskcurr) {
256  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
257  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
258  // only descendant of all deferred tied tasks can be scheduled, checking
259  // the last one is enough, as it in turn is the descendant of all others
260  kmp_taskdata_t *current = taskcurr->td_last_tied;
261  KMP_DEBUG_ASSERT(current != NULL);
262  // check if the task is not suspended on barrier
263  if (current->td_flags.tasktype == TASK_EXPLICIT ||
264  current->td_taskwait_thread > 0) { // <= 0 on barrier
265  kmp_int32 level = current->td_level;
266  kmp_taskdata_t *parent = tasknew->td_parent;
267  while (parent != current && parent->td_level > level) {
268  // check generation up to the level of the current task
269  parent = parent->td_parent;
270  KMP_DEBUG_ASSERT(parent != NULL);
271  }
272  if (parent != current)
273  return false;
274  }
275  }
276  // Check mutexinoutset dependencies, acquire locks
277  kmp_depnode_t *node = tasknew->td_depnode;
278  if (node && (node->dn.mtx_num_locks > 0)) {
279  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
280  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
281  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
282  continue;
283  // could not get the lock, release previous locks
284  for (int j = i - 1; j >= 0; --j)
285  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
286  return false;
287  }
288  // negative num_locks means all locks acquired successfully
289  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
290  }
291  return true;
292 }
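// Illustrative sketch of the mutexinoutset dependence arbitrated above:
// sibling tasks with depend(mutexinoutset: x) may run in any order but never
// concurrently, which __kmp_task_is_allowed enforces by taking the
// node->dn.mtx_locks before a candidate task may execute.  Hypothetical user
// code for any OpenMP 5.0 compiler targeting this runtime:
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  int counter = 0;
#pragma omp parallel
#pragma omp single
  {
    for (int i = 0; i < 8; ++i) {
#pragma omp task depend(mutexinoutset : counter)
      counter++; // tasks are mutually exclusive, so no race on counter
    }
#pragma omp taskwait
    printf("counter = %d\n", counter); // always prints 8
  }
  return 0;
}
#endif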
293 
294 // __kmp_realloc_task_deque:
295 // Re-allocates a task deque for a particular thread, copies the content from
296 // the old deque and adjusts the necessary data structures relating to the
297 // deque. This operation must be done with the deque_lock being held
298 static void __kmp_realloc_task_deque(kmp_info_t *thread,
299  kmp_thread_data_t *thread_data) {
300  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
301  kmp_int32 new_size = 2 * size;
302 
303  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
304  "%d] for thread_data %p\n",
305  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
306 
307  kmp_taskdata_t **new_deque =
308  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
309 
310  int i, j;
311  for (i = thread_data->td.td_deque_head, j = 0; j < size;
312  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
313  new_deque[j] = thread_data->td.td_deque[i];
314 
315  __kmp_free(thread_data->td.td_deque);
316 
317  thread_data->td.td_deque_head = 0;
318  thread_data->td.td_deque_tail = size;
319  thread_data->td.td_deque = new_deque;
320  thread_data->td.td_deque_size = new_size;
321 }
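// Worked example of the copy above (sizes shrunk for illustration): with a
// full deque of size 4 whose slots hold [T2 T3 T0 T1] and head == tail == 2
// (T0 is the oldest pending task), the loop walks i = 2, 3, 0, 1 and yields
// new_deque = [T0 T1 T2 T3] with head = 0, tail = 4 and size 8, preserving
// FIFO order while keeping the power-of-two TASK_DEQUE_MASK valid.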
322 
323 // __kmp_push_task: Add a task to the thread's deque
324 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
325  kmp_info_t *thread = __kmp_threads[gtid];
326  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
327  kmp_task_team_t *task_team = thread->th.th_task_team;
328  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
329  kmp_thread_data_t *thread_data;
330 
331  KA_TRACE(20,
332  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
333 
334  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
335  // untied task needs to increment counter so that the task structure is not
336  // freed prematurely
337  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
338  KMP_DEBUG_USE_VAR(counter);
339  KA_TRACE(
340  20,
341  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
342  gtid, counter, taskdata));
343  }
344 
345  // The first check avoids building task_team thread data if serialized
346  if (taskdata->td_flags.task_serial) {
347  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
348  "TASK_NOT_PUSHED for task %p\n",
349  gtid, taskdata));
350  return TASK_NOT_PUSHED;
351  }
352 
353  // Now that serialized tasks have returned, we can assume that we are not in
354  // immediate exec mode
355  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
356  if (!KMP_TASKING_ENABLED(task_team)) {
357  __kmp_enable_tasking(task_team, thread);
358  }
359  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
360  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
361 
362  // Find tasking deque specific to encountering thread
363  thread_data = &task_team->tt.tt_threads_data[tid];
364 
365  // No lock needed since only owner can allocate
366  if (thread_data->td.td_deque == NULL) {
367  __kmp_alloc_task_deque(thread, thread_data);
368  }
369 
370  int locked = 0;
371  // Check if deque is full
372  if (TCR_4(thread_data->td.td_deque_ntasks) >=
373  TASK_DEQUE_SIZE(thread_data->td)) {
374  if (__kmp_enable_task_throttling &&
375  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
376  thread->th.th_current_task)) {
377  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
378  "TASK_NOT_PUSHED for task %p\n",
379  gtid, taskdata));
380  return TASK_NOT_PUSHED;
381  } else {
382  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
383  locked = 1;
384  // expand deque to push the task which is not allowed to execute
385  __kmp_realloc_task_deque(thread, thread_data);
386  }
387  }
388  // Lock the deque for the task push operation
389  if (!locked) {
390  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
391  // Need to recheck as we can get a proxy task from thread outside of OpenMP
392  if (TCR_4(thread_data->td.td_deque_ntasks) >=
393  TASK_DEQUE_SIZE(thread_data->td)) {
394  if (__kmp_enable_task_throttling &&
395  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
396  thread->th.th_current_task)) {
397  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
398  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
399  "returning TASK_NOT_PUSHED for task %p\n",
400  gtid, taskdata));
401  return TASK_NOT_PUSHED;
402  } else {
403  // expand deque to push the task which is not allowed to execute
404  __kmp_realloc_task_deque(thread, thread_data);
405  }
406  }
407  }
408  // Must have room since no thread other than the calling thread can add tasks
409  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
410  TASK_DEQUE_SIZE(thread_data->td));
411 
412  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
413  taskdata; // Push taskdata
414  // Wrap index.
415  thread_data->td.td_deque_tail =
416  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
417  TCW_4(thread_data->td.td_deque_ntasks,
418  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
419 
420  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
421  "task=%p ntasks=%d head=%u tail=%u\n",
422  gtid, taskdata, thread_data->td.td_deque_ntasks,
423  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
424 
425  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
426 
427  return TASK_SUCCESSFULLY_PUSHED;
428 }
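// Note on TASK_NOT_PUSHED above: the task is not dropped.  The caller
// (__kmp_omp_task, defined later in this file) falls back to invoking the
// task immediately on the encountering thread, which is how throttling on a
// full deque degrades into serial execution instead of failing.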
429 
430 // __kmp_pop_current_task_from_thread: set up current task from called thread
431 // when team ends
432 //
433 // this_thr: thread structure to set current_task in.
434 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
435  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
436  "this_thread=%p, curtask=%p, "
437  "curtask_parent=%p\n",
438  0, this_thr, this_thr->th.th_current_task,
439  this_thr->th.th_current_task->td_parent));
440 
441  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
442 
443  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
444  "this_thread=%p, curtask=%p, "
445  "curtask_parent=%p\n",
446  0, this_thr, this_thr->th.th_current_task,
447  this_thr->th.th_current_task->td_parent));
448 }
449 
450 // __kmp_push_current_task_to_thread: set up current task in called thread for a
451 // new team
452 //
453 // this_thr: thread structure to set up
454 // team: team for implicit task data
455 // tid: thread within team to set up
456 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
457  int tid) {
458  // The current task of this thread is the parent of the newly created
459  // implicit tasks of the new team
460  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
461  "curtask=%p "
462  "parent_task=%p\n",
463  tid, this_thr, this_thr->th.th_current_task,
464  team->t.t_implicit_task_taskdata[tid].td_parent));
465 
466  KMP_DEBUG_ASSERT(this_thr != NULL);
467 
468  if (tid == 0) {
469  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
470  team->t.t_implicit_task_taskdata[0].td_parent =
471  this_thr->th.th_current_task;
472  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
473  }
474  } else {
475  team->t.t_implicit_task_taskdata[tid].td_parent =
476  team->t.t_implicit_task_taskdata[0].td_parent;
477  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
478  }
479 
480  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
481  "curtask=%p "
482  "parent_task=%p\n",
483  tid, this_thr, this_thr->th.th_current_task,
484  team->t.t_implicit_task_taskdata[tid].td_parent));
485 }
486 
487 // __kmp_task_start: bookkeeping for a task starting execution
488 //
489 // GTID: global thread id of calling thread
490 // task: task starting execution
491 // current_task: task suspending
492 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
493  kmp_taskdata_t *current_task) {
494  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
495  kmp_info_t *thread = __kmp_threads[gtid];
496 
497  KA_TRACE(10,
498  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
499  gtid, taskdata, current_task));
500 
501  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
502 
503  // mark currently executing task as suspended
504  // TODO: GEH - make sure root team implicit task is initialized properly.
505  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
506  current_task->td_flags.executing = 0;
507 
508 // Add task to stack if tied
509 #ifdef BUILD_TIED_TASK_STACK
510  if (taskdata->td_flags.tiedness == TASK_TIED) {
511  __kmp_push_task_stack(gtid, thread, taskdata);
512  }
513 #endif /* BUILD_TIED_TASK_STACK */
514 
515  // mark starting task as executing and as current task
516  thread->th.th_current_task = taskdata;
517 
518  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
519  taskdata->td_flags.tiedness == TASK_UNTIED);
520  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
521  taskdata->td_flags.tiedness == TASK_UNTIED);
522  taskdata->td_flags.started = 1;
523  taskdata->td_flags.executing = 1;
524  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
525  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
526 
527  // GEH TODO: shouldn't we pass some sort of location identifier here?
528  // APT: yes, we will pass location here.
529  // need to store current thread state (in a thread or taskdata structure)
530  // before setting work_state, otherwise wrong state is set after end of task
531 
532  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
533 
534  return;
535 }
536 
537 #if OMPT_SUPPORT
538 //------------------------------------------------------------------------------
539 // __ompt_task_init:
540 // Initialize OMPT fields maintained by a task. This will only be called after
541 // ompt_start_tool, so we already know whether ompt is enabled or not.
542 
543 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
544  // The calls to __ompt_task_init already have the ompt_enabled condition.
545  task->ompt_task_info.task_data.value = 0;
546  task->ompt_task_info.frame.exit_frame = ompt_data_none;
547  task->ompt_task_info.frame.enter_frame = ompt_data_none;
548  task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
549  task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
550  task->ompt_task_info.ndeps = 0;
551  task->ompt_task_info.deps = NULL;
552 }
553 
554 // __ompt_task_start:
555 // Build and trigger task-begin event
556 static inline void __ompt_task_start(kmp_task_t *task,
557  kmp_taskdata_t *current_task,
558  kmp_int32 gtid) {
559  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
560  ompt_task_status_t status = ompt_task_switch;
561  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
562  status = ompt_task_yield;
563  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
564  }
565  /* let OMPT know that we're about to run this task */
566  if (ompt_enabled.ompt_callback_task_schedule) {
567  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
568  &(current_task->ompt_task_info.task_data), status,
569  &(taskdata->ompt_task_info.task_data));
570  }
571  taskdata->ompt_task_info.scheduling_parent = current_task;
572 }
573 
574 // __ompt_task_finish:
575 // Build and trigger final task-schedule event
576 static inline void
577 __ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
578  ompt_task_status_t status = ompt_task_complete) {
579  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
580  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
581  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
582  status = ompt_task_cancel;
583  }
584 
585  /* let OMPT know that we're returning to the callee task */
586  if (ompt_enabled.ompt_callback_task_schedule) {
587  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
588  &(taskdata->ompt_task_info.task_data), status,
589  &((resumed_task ? resumed_task
590  : (taskdata->ompt_task_info.scheduling_parent
591  ? taskdata->ompt_task_info.scheduling_parent
592  : taskdata->td_parent))
593  ->ompt_task_info.task_data));
594  }
595 }
596 #endif
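// Illustrative sketch of a first-party OMPT tool that would receive the
// task-schedule events triggered by __ompt_task_start/__ompt_task_finish
// above.  Signatures follow the OpenMP 5.0 omp-tools.h interface; such a
// tool is built as a separate shared library rather than as part of the
// runtime:
#if 0
#include <omp-tools.h>
#include <stdio.h>

static void on_task_schedule(ompt_data_t *prior_task_data,
                             ompt_task_status_t prior_task_status,
                             ompt_data_t *next_task_data) {
  printf("task switch: prior=%p status=%d next=%p\n", (void *)prior_task_data,
         (int)prior_task_status, (void *)next_task_data);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_callback(ompt_callback_task_schedule, (ompt_callback_t)on_task_schedule);
  return 1; // nonzero keeps the tool active
}

static void tool_finalize(ompt_data_t *tool_data) {}

ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {tool_initialize, tool_finalize,
                                            {0}};
  return &result;
}
#endif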
597 
598 template <bool ompt>
599 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
600  kmp_task_t *task,
601  void *frame_address,
602  void *return_address) {
603  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
604  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
605 
606  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
607  "current_task=%p\n",
608  gtid, loc_ref, taskdata, current_task));
609 
610  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
611  // untied task needs to increment counter so that the task structure is not
612  // freed prematurely
613  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
614  KMP_DEBUG_USE_VAR(counter);
615  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
616  "incremented for task %p\n",
617  gtid, counter, taskdata));
618  }
619 
620  taskdata->td_flags.task_serial =
621  1; // Execute this task immediately, not deferred.
622  __kmp_task_start(gtid, task, current_task);
623 
624 #if OMPT_SUPPORT
625  if (ompt) {
626  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
627  current_task->ompt_task_info.frame.enter_frame.ptr =
628  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
629  current_task->ompt_task_info.frame.enter_frame_flags =
630  taskdata->ompt_task_info.frame.exit_frame_flags = ompt_frame_application | ompt_frame_framepointer;
631  }
632  if (ompt_enabled.ompt_callback_task_create) {
633  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
634  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
635  &(parent_info->task_data), &(parent_info->frame),
636  &(taskdata->ompt_task_info.task_data),
637  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
638  return_address);
639  }
640  __ompt_task_start(task, current_task, gtid);
641  }
642 #endif // OMPT_SUPPORT
643 
644  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
645  loc_ref, taskdata));
646 }
647 
648 #if OMPT_SUPPORT
649 OMPT_NOINLINE
650 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
651  kmp_task_t *task,
652  void *frame_address,
653  void *return_address) {
654  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
655  return_address);
656 }
657 #endif // OMPT_SUPPORT
658 
659 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
660 // execution
661 //
662 // loc_ref: source location information; points to beginning of task block.
663 // gtid: global thread number.
664 // task: task thunk for the started task.
665 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
666  kmp_task_t *task) {
667 #if OMPT_SUPPORT
668  if (UNLIKELY(ompt_enabled.enabled)) {
669  OMPT_STORE_RETURN_ADDRESS(gtid);
670  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
671  OMPT_GET_FRAME_ADDRESS(1),
672  OMPT_LOAD_RETURN_ADDRESS(gtid));
673  return;
674  }
675 #endif
676  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
677 }
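// Illustrative sketch of the call sequence a compiler typically emits for an
// undeferred task such as `#pragma omp task if(0)`: allocate the task,
// bracket the body with the begin_if0/complete_if0 entry points defined in
// this file, and call the outlined routine directly on the encountering
// thread.  Schematic only; the exact lowering is compiler specific and the
// helper name below is made up.
#if 0
static void lowered_task_if0(ident_t *loc, kmp_int32 gtid,
                             kmp_routine_entry_t outlined_task_entry,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds) {
  kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, /*flags=*/1 /* tied */,
                                        sizeof_kmp_task_t, sizeof_shareds,
                                        outlined_task_entry);
  // ...compiler-emitted copies of firstprivate/shared data into *t...
  __kmpc_omp_task_begin_if0(loc, gtid, t);
  outlined_task_entry(gtid, t); // run the task body immediately
  __kmpc_omp_task_complete_if0(loc, gtid, t);
}
#endif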
678 
679 #ifdef TASK_UNUSED
680 // __kmpc_omp_task_begin: report that a given task has started execution
681 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
682 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
683  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
684 
685  KA_TRACE(
686  10,
687  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
688  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
689 
690  __kmp_task_start(gtid, task, current_task);
691 
692  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
693  loc_ref, KMP_TASK_TO_TASKDATA(task)));
694  return;
695 }
696 #endif // TASK_UNUSED
697 
698 // __kmp_free_task: free the current task space and the space for shareds
699 //
700 // gtid: Global thread ID of calling thread
701 // taskdata: task to free
702 // thread: thread data structure of caller
703 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
704  kmp_info_t *thread) {
705  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
706  taskdata));
707 
708  // Check to make sure all flags and counters have the correct values
709  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
710  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
711  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
712  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
713  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
714  taskdata->td_flags.task_serial == 1);
715  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
716 
717  taskdata->td_flags.freed = 1;
718  ANNOTATE_HAPPENS_BEFORE(taskdata);
719 // deallocate the taskdata and shared variable blocks associated with this task
720 #if USE_FAST_MEMORY
721  __kmp_fast_free(thread, taskdata);
722 #else /* ! USE_FAST_MEMORY */
723  __kmp_thread_free(thread, taskdata);
724 #endif
725 
726  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
727 }
728 
729 // __kmp_free_task_and_ancestors: free the current task and ancestors without
730 // children
731 //
732 // gtid: Global thread ID of calling thread
733 // taskdata: task to free
734 // thread: thread data structure of caller
735 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
736  kmp_taskdata_t *taskdata,
737  kmp_info_t *thread) {
738  // Proxy tasks must always be allowed to free their parents
739  // because they can be run in background even in serial mode.
740  kmp_int32 team_serial =
741  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
742  !taskdata->td_flags.proxy;
743  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
744 
745  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
746  KMP_DEBUG_ASSERT(children >= 0);
747 
748  // Now, go up the ancestor tree to see if any ancestors can now be freed.
749  while (children == 0) {
750  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
751 
752  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
753  "and freeing itself\n",
754  gtid, taskdata));
755 
756  // --- Deallocate my ancestor task ---
757  __kmp_free_task(gtid, taskdata, thread);
758 
759  taskdata = parent_taskdata;
760 
761  if (team_serial)
762  return;
763  // Stop checking ancestors at implicit task instead of walking up ancestor
764  // tree to avoid premature deallocation of ancestors.
765  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
766  if (taskdata->td_dephash) { // do we need to cleanup dephash?
767  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
768  kmp_tasking_flags_t flags_old = taskdata->td_flags;
769  if (children == 0 && flags_old.complete == 1) {
770  kmp_tasking_flags_t flags_new = flags_old;
771  flags_new.complete = 0;
772  if (KMP_COMPARE_AND_STORE_ACQ32(
773  RCAST(kmp_int32 *, &taskdata->td_flags),
774  *RCAST(kmp_int32 *, &flags_old),
775  *RCAST(kmp_int32 *, &flags_new))) {
776  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
777  "dephash of implicit task %p\n",
778  gtid, taskdata));
779  // cleanup dephash of finished implicit task
780  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
781  }
782  }
783  }
784  return;
785  }
786  // Predecrement simulated by "- 1" calculation
787  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
788  KMP_DEBUG_ASSERT(children >= 0);
789  }
790 
791  KA_TRACE(
792  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
793  "not freeing it yet\n",
794  gtid, taskdata, children));
795 }
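// Note on the reference counting above: every explicit task starts with
// td_allocated_child_tasks == 1 (a self reference, see __kmp_task_alloc
// below) and each explicit child adds one to its parent.  The decrement at
// the top of this routine releases the self reference; a task is physically
// freed only once that count reaches zero, i.e. after all of its allocated
// children have themselves been freed, and the loop then retires any
// ancestors whose counts also drop to zero.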
796 
797 // __kmp_task_finish: bookkeeping to do when a task finishes execution
798 //
799 // gtid: global thread ID for calling thread
800 // task: task to be finished
801 // resumed_task: task to be resumed. (may be NULL if task is serialized)
802 template <bool ompt>
803 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
804  kmp_taskdata_t *resumed_task) {
805  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
806  kmp_info_t *thread = __kmp_threads[gtid];
807  kmp_task_team_t *task_team =
808  thread->th.th_task_team; // might be NULL for serial teams...
809  kmp_int32 children = 0;
810 
811  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
812  "task %p\n",
813  gtid, taskdata, resumed_task));
814 
815  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
816 
817 // Pop task from stack if tied
818 #ifdef BUILD_TIED_TASK_STACK
819  if (taskdata->td_flags.tiedness == TASK_TIED) {
820  __kmp_pop_task_stack(gtid, thread, taskdata);
821  }
822 #endif /* BUILD_TIED_TASK_STACK */
823 
824  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
825  // untied task needs to check the counter so that the task structure is not
826  // freed prematurely
827  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
828  KA_TRACE(
829  20,
830  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
831  gtid, counter, taskdata));
832  if (counter > 0) {
833  // untied task is not done, to be continued possibly by other thread, do
834  // not free it now
835  if (resumed_task == NULL) {
836  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
837  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
838  // task is the parent
839  }
840  thread->th.th_current_task = resumed_task; // restore current_task
841  resumed_task->td_flags.executing = 1; // resume previous task
842  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
843  "resuming task %p\n",
844  gtid, taskdata, resumed_task));
845  return;
846  }
847  }
848 #if OMPT_SUPPORT
849  if (ompt)
850  __ompt_task_finish(task, resumed_task);
851 #endif
852 
853  // Check mutexinoutset dependencies, release locks
854  kmp_depnode_t *node = taskdata->td_depnode;
855  if (node && (node->dn.mtx_num_locks < 0)) {
856  // negative num_locks means all locks were acquired
857  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
858  for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
859  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
860  __kmp_release_lock(node->dn.mtx_locks[i], gtid);
861  }
862  }
863 
864  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
865  bool detach = false;
866  if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
867  if (taskdata->td_allow_completion_event.type ==
868  KMP_EVENT_ALLOW_COMPLETION) {
869  // event hasn't been fulfilled yet. Try to detach task.
870  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
871  if (taskdata->td_allow_completion_event.type ==
872  KMP_EVENT_ALLOW_COMPLETION) {
873  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
874  detach = true;
875  }
876  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
877  }
878  }
879  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
880  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
881 
882  if (!detach) {
883  taskdata->td_flags.complete = 1; // mark the task as completed
884 
885  // Only need to keep track of count if team parallel and tasking not
886  // serialized
887  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
888  // Predecrement simulated by "- 1" calculation
889  children =
890  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
891  KMP_DEBUG_ASSERT(children >= 0);
892  if (taskdata->td_taskgroup)
893  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
894  __kmp_release_deps(gtid, taskdata);
895  } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
896  // if we found proxy tasks there could exist a dependency chain
897  // with the proxy task as origin
898  __kmp_release_deps(gtid, taskdata);
899  }
900  }
901 
902  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
903  // called. Otherwise, if a task is executed immediately from the release_deps
904  // code, the flag will be reset to 1 again by this same function
905  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
906  taskdata->td_flags.executing = 0; // suspend the finishing task
907 
908  KA_TRACE(
909  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
910  gtid, taskdata, children));
911 
912  /* If the task's destructor thunk flag has been set, we need to invoke the
913  destructor thunk that has been generated by the compiler. The code is
914  placed here, since at this point other tasks might have been released
915  hence overlapping the destructor invocations with some other work in the
916  released tasks. The OpenMP spec is not specific on when the destructors
917  are invoked, so we should be free to choose. */
918  if (taskdata->td_flags.destructors_thunk) {
919  kmp_routine_entry_t destr_thunk = task->data1.destructors;
920  KMP_ASSERT(destr_thunk);
921  destr_thunk(gtid, task);
922  }
923 
924  // bookkeeping for resuming task:
925  // GEH - note tasking_ser => task_serial
926  KMP_DEBUG_ASSERT(
927  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
928  taskdata->td_flags.task_serial);
929  if (taskdata->td_flags.task_serial) {
930  if (resumed_task == NULL) {
931  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
932  // task is the parent
933  }
934  } else {
935  KMP_DEBUG_ASSERT(resumed_task !=
936  NULL); // verify that resumed task is passed as argument
937  }
938 
939  // Free this task and then ancestor tasks if they have no children.
940  // Restore th_current_task first as suggested by John:
941  // johnmc: if an asynchronous inquiry peers into the runtime system
942  // it doesn't see the freed task as the current task.
943  thread->th.th_current_task = resumed_task;
944  if (!detach)
945  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
946 
947  // TODO: GEH - make sure root team implicit task is initialized properly.
948  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
949  resumed_task->td_flags.executing = 1; // resume previous task
950 
951  KA_TRACE(
952  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
953  gtid, taskdata, resumed_task));
954 
955  return;
956 }
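// Illustrative sketch of the detach path handled above: a task created with
// the OpenMP 5.0 detach clause does not complete when its body returns; if
// the event has not been fulfilled yet, the code above converts it into a
// proxy task instead of freeing it, and completion happens later via
// omp_fulfill_event.  Hypothetical user code; start_async_work stands in for
// any asynchronous engine that eventually fulfills the event:
#if 0
#include <omp.h>

extern void start_async_work(omp_event_handle_t ev); // later calls omp_fulfill_event(ev)

void detach_example(void) {
  omp_event_handle_t ev;
#pragma omp task detach(ev)
  {
    start_async_work(ev); // body returns here, but the task stays incomplete
  }
#pragma omp taskwait // returns only after the event has been fulfilled
}
#endif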
957 
958 template <bool ompt>
959 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
960  kmp_int32 gtid,
961  kmp_task_t *task) {
962  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
963  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
964  // this routine will provide task to resume
965  __kmp_task_finish<ompt>(gtid, task, NULL);
966 
967  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
968  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
969 
970 #if OMPT_SUPPORT
971  if (ompt) {
972  ompt_frame_t *ompt_frame;
973  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
974  ompt_frame->enter_frame = ompt_data_none;
975  ompt_frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
976  }
977 #endif
978 
979  return;
980 }
981 
982 #if OMPT_SUPPORT
983 OMPT_NOINLINE
984 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
985  kmp_task_t *task) {
986  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
987 }
988 #endif // OMPT_SUPPORT
989 
990 // __kmpc_omp_task_complete_if0: report that a task has completed execution
991 //
992 // loc_ref: source location information; points to end of task block.
993 // gtid: global thread number.
994 // task: task thunk for the completed task.
995 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
996  kmp_task_t *task) {
997 #if OMPT_SUPPORT
998  if (UNLIKELY(ompt_enabled.enabled)) {
999  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1000  return;
1001  }
1002 #endif
1003  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1004 }
1005 
1006 #ifdef TASK_UNUSED
1007 // __kmpc_omp_task_complete: report that a task has completed execution
1008 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1009 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1010  kmp_task_t *task) {
1011  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1012  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1013 
1014  __kmp_task_finish<false>(gtid, task,
1015  NULL); // Not sure how to find task to resume
1016 
1017  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1018  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1019  return;
1020 }
1021 #endif // TASK_UNUSED
1022 
1023 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1024 // task for a given thread
1025 //
1026 // loc_ref: reference to source location of parallel region
1027 // this_thr: thread data structure corresponding to implicit task
1028 // team: team for this_thr
1029 // tid: thread id of given thread within team
1030 // set_curr_task: TRUE if need to push current task to thread
1031 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1032 // have already been done elsewhere.
1033 // TODO: Get better loc_ref. Value passed in may be NULL
1034 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1035  kmp_team_t *team, int tid, int set_curr_task) {
1036  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1037 
1038  KF_TRACE(
1039  10,
1040  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1041  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1042 
1043  task->td_task_id = KMP_GEN_TASK_ID();
1044  task->td_team = team;
1045  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1046  // in debugger)
1047  task->td_ident = loc_ref;
1048  task->td_taskwait_ident = NULL;
1049  task->td_taskwait_counter = 0;
1050  task->td_taskwait_thread = 0;
1051 
1052  task->td_flags.tiedness = TASK_TIED;
1053  task->td_flags.tasktype = TASK_IMPLICIT;
1054  task->td_flags.proxy = TASK_FULL;
1055 
1056  // All implicit tasks are executed immediately, not deferred
1057  task->td_flags.task_serial = 1;
1058  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1059  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1060 
1061  task->td_flags.started = 1;
1062  task->td_flags.executing = 1;
1063  task->td_flags.complete = 0;
1064  task->td_flags.freed = 0;
1065 
1066  task->td_depnode = NULL;
1067  task->td_last_tied = task;
1068  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1069 
1070  if (set_curr_task) { // only do this init first time thread is created
1071  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1072  // Not used: don't need to deallocate implicit task
1073  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1074  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1075  task->td_dephash = NULL;
1076  __kmp_push_current_task_to_thread(this_thr, team, tid);
1077  } else {
1078  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1079  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1080  }
1081 
1082 #if OMPT_SUPPORT
1083  if (UNLIKELY(ompt_enabled.enabled))
1084  __ompt_task_init(task, tid);
1085 #endif
1086 
1087  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1088  team, task));
1089 }
1090 
1091 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1092 // at the end of parallel regions. Some resources are kept for reuse in the next
1093 // parallel region.
1094 //
1095 // thread: thread data structure corresponding to implicit task
1096 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1097  kmp_taskdata_t *task = thread->th.th_current_task;
1098  if (task->td_dephash) {
1099  int children;
1100  task->td_flags.complete = 1;
1101  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1102  kmp_tasking_flags_t flags_old = task->td_flags;
1103  if (children == 0 && flags_old.complete == 1) {
1104  kmp_tasking_flags_t flags_new = flags_old;
1105  flags_new.complete = 0;
1106  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1107  *RCAST(kmp_int32 *, &flags_old),
1108  *RCAST(kmp_int32 *, &flags_new))) {
1109  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1110  "dephash of implicit task %p\n",
1111  thread->th.th_info.ds.ds_gtid, task));
1112  __kmp_dephash_free_entries(thread, task->td_dephash);
1113  }
1114  }
1115  }
1116 }
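// Note on the CAS above: the same complete-flag flip appears in
// __kmp_free_task_and_ancestors for implicit tasks, so whichever path wins
// the compare-and-swap is the only one that frees the dephash entries; the
// loser observes complete == 0 and skips the cleanup.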
1117 
1118 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1119 // when these are destroyed
1120 //
1121 // thread: thread data structure corresponding to implicit task
1122 void __kmp_free_implicit_task(kmp_info_t *thread) {
1123  kmp_taskdata_t *task = thread->th.th_current_task;
1124  if (task && task->td_dephash) {
1125  __kmp_dephash_free(thread, task->td_dephash);
1126  task->td_dephash = NULL;
1127  }
1128 }
1129 
1130 // Round up a size to a multiple of the power of two specified by val: Used to insert padding
1131 // between structures co-allocated using a single malloc() call
1132 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1133  if (size & (val - 1)) {
1134  size &= ~(val - 1);
1135  if (size <= KMP_SIZE_T_MAX - val) {
1136  size += val; // Round up if there is no overflow.
1137  }
1138  }
1139  return size;
1140 } // __kmp_round_up_to_val
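// Worked example: __kmp_round_up_to_val(40, 8) returns 40 (already a
// multiple of 8), while __kmp_round_up_to_val(41, 8) masks down to 40 and
// then adds 8, returning 48.  val must be a power of two for the masking to
// be correct, which holds for the sizeof(void *) padding use below.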
1141 
1142 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1143 //
1144 // loc_ref: source location information
1145 // gtid: global thread number.
1146 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1147 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1148 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1149 // private vars accessed in task.
1150 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1151 // in task.
1152 // task_entry: Pointer to task code entry point generated by compiler.
1153 // returns: a pointer to the allocated kmp_task_t structure (task).
1154 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1155  kmp_tasking_flags_t *flags,
1156  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1157  kmp_routine_entry_t task_entry) {
1158  kmp_task_t *task;
1159  kmp_taskdata_t *taskdata;
1160  kmp_info_t *thread = __kmp_threads[gtid];
1161  kmp_team_t *team = thread->th.th_team;
1162  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1163  size_t shareds_offset;
1164 
1165  if (!TCR_4(__kmp_init_middle))
1166  __kmp_middle_initialize();
1167 
1168  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1169  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1170  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1171  sizeof_shareds, task_entry));
1172 
1173  if (parent_task->td_flags.final) {
1174  if (flags->merged_if0) {
1175  }
1176  flags->final = 1;
1177  }
1178  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1179  // Untied task encountered causes the TSC algorithm to check entire deque of
1180  // the victim thread. If no untied task encountered, then checking the head
1181  // of the deque should be enough.
1182  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1183  }
1184 
1185  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1186  // the tasking setup
1187  // when that happens is too late.
1188  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
1189  if (flags->proxy == TASK_PROXY) {
1190  flags->tiedness = TASK_UNTIED;
1191  flags->merged_if0 = 1;
1192  }
1193  /* are we running in a sequential parallel or tskm_immediate_exec... we need
1194  tasking support enabled */
1195  if ((thread->th.th_task_team) == NULL) {
1196  /* This should only happen if the team is serialized
1197  setup a task team and propagate it to the thread */
1198  KMP_DEBUG_ASSERT(team->t.t_serialized);
1199  KA_TRACE(30,
1200  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1201  gtid));
1202  __kmp_task_team_setup(
1203  thread, team,
1204  1); // 1 indicates setup the current team regardless of nthreads
1205  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1206  }
1207  kmp_task_team_t *task_team = thread->th.th_task_team;
1208 
1209  /* tasking must be enabled now as the task might not be pushed */
1210  if (!KMP_TASKING_ENABLED(task_team)) {
1211  KA_TRACE(
1212  30,
1213  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1214  __kmp_enable_tasking(task_team, thread);
1215  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1216  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1217  // No lock needed since only owner can allocate
1218  if (thread_data->td.td_deque == NULL) {
1219  __kmp_alloc_task_deque(thread, thread_data);
1220  }
1221  }
1222 
1223  if (task_team->tt.tt_found_proxy_tasks == FALSE)
1224  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1225  }
1226 
1227  // Calculate shared structure offset including padding after kmp_task_t struct
1228  // to align pointers in shared struct
1229  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1230  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
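// Resulting layout of the single allocation made below:
//
//   taskdata                   task == KMP_TASKDATA_TO_TASK(taskdata)
//   |                          |
//   v                          v
//   [ kmp_taskdata_t ][ kmp_task_t + privates ][ pad ][ shareds ]
//   |<-------------------- shareds_offset ----------->|
//
// so task->shareds can simply point at &((char *)taskdata)[shareds_offset].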
1231 
1232  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1233  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1234  shareds_offset));
1235  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1236  sizeof_shareds));
1237 
1238 // Avoid double allocation here by combining shareds with taskdata
1239 #if USE_FAST_MEMORY
1240  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1241  sizeof_shareds);
1242 #else /* ! USE_FAST_MEMORY */
1243  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1244  sizeof_shareds);
1245 #endif /* USE_FAST_MEMORY */
1246  ANNOTATE_HAPPENS_AFTER(taskdata);
1247 
1248  task = KMP_TASKDATA_TO_TASK(taskdata);
1249 
1250 // Make sure task & taskdata are aligned appropriately
1251 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1252  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1253  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1254 #else
1255  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1256  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1257 #endif
1258  if (sizeof_shareds > 0) {
1259  // Avoid double allocation here by combining shareds with taskdata
1260  task->shareds = &((char *)taskdata)[shareds_offset];
1261  // Make sure shareds struct is aligned to pointer size
1262  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1263  0);
1264  } else {
1265  task->shareds = NULL;
1266  }
1267  task->routine = task_entry;
1268  task->part_id = 0; // AC: Always start with 0 part id
1269 
1270  taskdata->td_task_id = KMP_GEN_TASK_ID();
1271  taskdata->td_team = team;
1272  taskdata->td_alloc_thread = thread;
1273  taskdata->td_parent = parent_task;
1274  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1275  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1276  taskdata->td_ident = loc_ref;
1277  taskdata->td_taskwait_ident = NULL;
1278  taskdata->td_taskwait_counter = 0;
1279  taskdata->td_taskwait_thread = 0;
1280  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1281  // avoid copying icvs for proxy tasks
1282  if (flags->proxy == TASK_FULL)
1283  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1284 
1285  taskdata->td_flags.tiedness = flags->tiedness;
1286  taskdata->td_flags.final = flags->final;
1287  taskdata->td_flags.merged_if0 = flags->merged_if0;
1288  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1289  taskdata->td_flags.proxy = flags->proxy;
1290  taskdata->td_flags.detachable = flags->detachable;
1291  taskdata->td_task_team = thread->th.th_task_team;
1292  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1293  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1294 
1295  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1296  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1297 
1298  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1299  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1300 
1301  // GEH - Note we serialize the task if the team is serialized to make sure
1302  // implicit parallel region tasks are not left until program termination to
1303  // execute. Also, it helps locality to execute immediately.
1304 
1305  taskdata->td_flags.task_serial =
1306  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1307  taskdata->td_flags.tasking_ser);
1308 
1309  taskdata->td_flags.started = 0;
1310  taskdata->td_flags.executing = 0;
1311  taskdata->td_flags.complete = 0;
1312  taskdata->td_flags.freed = 0;
1313 
1314  taskdata->td_flags.native = flags->native;
1315 
1316  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1317  // start at one because counts current task and children
1318  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1319  taskdata->td_taskgroup =
1320  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1321  taskdata->td_dephash = NULL;
1322  taskdata->td_depnode = NULL;
1323  if (flags->tiedness == TASK_UNTIED)
1324  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1325  else
1326  taskdata->td_last_tied = taskdata;
1327  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1328 #if OMPT_SUPPORT
1329  if (UNLIKELY(ompt_enabled.enabled))
1330  __ompt_task_init(taskdata, gtid);
1331 #endif
1332 // Only need to keep track of child task counts if team parallel and tasking not
1333 // serialized or if it is a proxy or detachable task
1334  if (flags->proxy == TASK_PROXY ||
1335  flags->detachable == TASK_DETACHABLE ||
1336  !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1337  {
1338  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1339  if (parent_task->td_taskgroup)
1340  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1341  // Only need to keep track of allocated child tasks for explicit tasks since
1342  // implicit not deallocated
1343  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1344  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1345  }
1346  }
1347 
1348  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1349  gtid, taskdata, taskdata->td_parent));
1350  ANNOTATE_HAPPENS_BEFORE(task);
1351 
1352  return task;
1353 }
1354 
1355 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1356  kmp_int32 flags, size_t sizeof_kmp_task_t,
1357  size_t sizeof_shareds,
1358  kmp_routine_entry_t task_entry) {
1359  kmp_task_t *retval;
1360  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1361 
1362  input_flags->native = FALSE;
1363 // __kmp_task_alloc() sets up all other runtime flags
1364 
1365  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1366  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1367  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1368  input_flags->proxy ? "proxy" : "",
1369  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1370  sizeof_shareds, task_entry));
1371 
1372  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1373  sizeof_shareds, task_entry);
1374 
1375  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1376 
1377  return retval;
1378 }
1379 
1380 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1381  kmp_int32 flags,
1382  size_t sizeof_kmp_task_t,
1383  size_t sizeof_shareds,
1384  kmp_routine_entry_t task_entry,
1385  kmp_int64 device_id) {
1386  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1387  sizeof_shareds, task_entry);
1388 }
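// Note: device_id is accepted by the target-task entry point above but is
// not used here; the call simply forwards to __kmpc_omp_task_alloc, so the
// resulting task is allocated exactly like a host task.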
1389 
1403 kmp_int32
1404 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1405  kmp_task_t *new_task, kmp_int32 naffins,
1406  kmp_task_affinity_info_t *affin_list) {
1407  return 0;
1408 }
1409 
1410 // __kmp_invoke_task: invoke the specified task
1411 //
1412 // gtid: global thread ID of caller
1413 // task: the task to invoke
1414 // current_task: the task to resume after task invocation
1415 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1416  kmp_taskdata_t *current_task) {
1417  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1418  kmp_info_t *thread;
1419  int discard = 0 /* false */;
1420  KA_TRACE(
1421  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1422  gtid, taskdata, current_task));
1423  KMP_DEBUG_ASSERT(task);
1424  if (taskdata->td_flags.proxy == TASK_PROXY &&
1425  taskdata->td_flags.complete == 1) {
1426  // This is a proxy task that was already completed but it needs to run
1427  // its bottom-half finish
1428  KA_TRACE(
1429  30,
1430  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1431  gtid, taskdata));
1432 
1433  __kmp_bottom_half_finish_proxy(gtid, task);
1434 
1435  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1436  "proxy task %p, resuming task %p\n",
1437  gtid, taskdata, current_task));
1438 
1439  return;
1440  }
1441 
1442 #if OMPT_SUPPORT
1443  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1444  // does not execute code.
1445  ompt_thread_info_t oldInfo;
1446  if (UNLIKELY(ompt_enabled.enabled)) {
1447  // Store the thread's state and restore it after the task
1448  thread = __kmp_threads[gtid];
1449  oldInfo = thread->th.ompt_thread_info;
1450  thread->th.ompt_thread_info.wait_id = 0;
1451  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1452  ? ompt_state_work_serial
1453  : ompt_state_work_parallel;
1454  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1455  }
1456 #endif
1457 
1458  // Proxy tasks are not handled by the runtime
1459  if (taskdata->td_flags.proxy != TASK_PROXY) {
1460  ANNOTATE_HAPPENS_AFTER(task);
1461  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1462  }
1463 
1464  // TODO: cancel tasks if the parallel region has also been cancelled
1465  // TODO: check if this sequence can be hoisted above __kmp_task_start
1466  // if cancellation has been enabled for this run ...
1467  if (__kmp_omp_cancellation) {
1468  thread = __kmp_threads[gtid];
1469  kmp_team_t *this_team = thread->th.th_team;
1470  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1471  if ((taskgroup && taskgroup->cancel_request) ||
1472  (this_team->t.t_cancel_request == cancel_parallel)) {
1473 #if OMPT_SUPPORT && OMPT_OPTIONAL
1474  ompt_data_t *task_data;
1475  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1476  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1477  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1478  task_data,
1479  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1480  : ompt_cancel_parallel) |
1481  ompt_cancel_discarded_task,
1482  NULL);
1483  }
1484 #endif
1485  KMP_COUNT_BLOCK(TASK_cancelled);
1486  // this task belongs to a taskgroup or parallel region being cancelled, so discard it
1487  discard = 1 /* true */;
1488  }
1489  }
1490 
1491  // Invoke the task routine and pass in relevant data.
1492  // Thunks generated by gcc take a different argument list.
1493  if (!discard) {
1494  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1495  taskdata->td_last_tied = current_task->td_last_tied;
1496  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1497  }
1498 #if KMP_STATS_ENABLED
1499  KMP_COUNT_BLOCK(TASK_executed);
1500  switch (KMP_GET_THREAD_STATE()) {
1501  case FORK_JOIN_BARRIER:
1502  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1503  break;
1504  case PLAIN_BARRIER:
1505  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1506  break;
1507  case TASKYIELD:
1508  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1509  break;
1510  case TASKWAIT:
1511  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1512  break;
1513  case TASKGROUP:
1514  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1515  break;
1516  default:
1517  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1518  break;
1519  }
1520 #endif // KMP_STATS_ENABLED
1521 
1522 // OMPT task begin
1523 #if OMPT_SUPPORT
1524  if (UNLIKELY(ompt_enabled.enabled))
1525  __ompt_task_start(task, current_task, gtid);
1526 #endif
1527 
1528 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1529  kmp_uint64 cur_time;
1530  kmp_int32 kmp_itt_count_task =
1531  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1532  current_task->td_flags.tasktype == TASK_IMPLICIT;
1533  if (kmp_itt_count_task) {
1534  thread = __kmp_threads[gtid];
1535  // Time outer level explicit task on barrier for adjusting imbalance time
1536  if (thread->th.th_bar_arrive_time)
1537  cur_time = __itt_get_timestamp();
1538  else
1539  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1540  }
1541 #endif
1542 
1543 #ifdef KMP_GOMP_COMPAT
1544  if (taskdata->td_flags.native) {
1545  ((void (*)(void *))(*(task->routine)))(task->shareds);
1546  } else
1547 #endif /* KMP_GOMP_COMPAT */
1548  {
1549  (*(task->routine))(gtid, task);
1550  }
1551  KMP_POP_PARTITIONED_TIMER();
1552 
1553 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1554  if (kmp_itt_count_task) {
1555  // Barrier imbalance - adjust arrive time with the task duration
1556  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1557  }
1558 #endif
1559 
1560  }
1561 
1562 
1563  // Proxy tasks are not handled by the runtime
1564  if (taskdata->td_flags.proxy != TASK_PROXY) {
1565  ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1566 #if OMPT_SUPPORT
1567  if (UNLIKELY(ompt_enabled.enabled)) {
1568  thread->th.ompt_thread_info = oldInfo;
1569  if (taskdata->td_flags.tiedness == TASK_TIED) {
1570  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1571  }
1572  __kmp_task_finish<true>(gtid, task, current_task);
1573  } else
1574 #endif
1575  __kmp_task_finish<false>(gtid, task, current_task);
1576  }
1577 
1578  KA_TRACE(
1579  30,
1580  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1581  gtid, taskdata, current_task));
1582  return;
1583 }
1584 
1585 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1586 //
1587 // loc_ref: location of original task pragma (ignored)
1588 // gtid: Global Thread ID of encountering thread
1589 // new_task: task thunk allocated by __kmpc_omp_task_alloc() for the "new task"
1590 // Returns:
1591 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1592 // queued to be resumed later.
1593 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1594 // be resumed later.
1595 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1596  kmp_task_t *new_task) {
1597  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1598 
1599  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1600  loc_ref, new_taskdata));
1601 
1602 #if OMPT_SUPPORT
1603  kmp_taskdata_t *parent;
1604  if (UNLIKELY(ompt_enabled.enabled)) {
1605  parent = new_taskdata->td_parent;
1606  if (ompt_enabled.ompt_callback_task_create) {
1607  ompt_data_t task_data = ompt_data_none;
1608  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1609  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1610  parent ? &(parent->ompt_task_info.frame) : NULL,
1611  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1612  OMPT_GET_RETURN_ADDRESS(0));
1613  }
1614  }
1615 #endif
1616 
1617  /* Should we execute the new task or queue it? For now, let's just always try
1618  to queue it. If the queue fills up, then we'll execute it. */
1619 
1620  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1621  { // Execute this task immediately
1622  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1623  new_taskdata->td_flags.task_serial = 1;
1624  __kmp_invoke_task(gtid, new_task, current_task);
1625  }
1626 
1627  KA_TRACE(
1628  10,
1629  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1630  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1631  gtid, loc_ref, new_taskdata));
1632 
1633  ANNOTATE_HAPPENS_BEFORE(new_task);
1634 #if OMPT_SUPPORT
1635  if (UNLIKELY(ompt_enabled.enabled)) {
1636  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1637  }
1638 #endif
1639  return TASK_CURRENT_NOT_QUEUED;
1640 }
1641 
1642 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1643 //
1644 // gtid: Global Thread ID of encountering thread
1645 // new_task: non-thread-switchable task thunk allocated by __kmpc_omp_task_alloc()
1646 // serialize_immediate: if TRUE then if the task is executed immediately its
1647 // execution will be serialized
1648 // Returns:
1649 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1650 // queued to be resumed later.
1651 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1652 // be resumed later.
1653 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1654  bool serialize_immediate) {
1655  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1656 
1657  /* Should we execute the new task or queue it? For now, let's just always try
1658  to queue it. If the queue fills up, then we'll execute it. */
1659  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1660  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1661  { // Execute this task immediately
1662  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1663  if (serialize_immediate)
1664  new_taskdata->td_flags.task_serial = 1;
1665  __kmp_invoke_task(gtid, new_task, current_task);
1666  }
1667 
1668  ANNOTATE_HAPPENS_BEFORE(new_task);
1669  return TASK_CURRENT_NOT_QUEUED;
1670 }
1671 
1672 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1673 // non-thread-switchable task from the parent thread only!
1674 //
1675 // loc_ref: location of original task pragma (ignored)
1676 // gtid: Global Thread ID of encountering thread
1677 // new_task: non-thread-switchable task thunk allocated by
1678 // __kmpc_omp_task_alloc()
1679 // Returns:
1680 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1681 // queued to be resumed later.
1682 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1683 // be resumed later.
1684 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1685  kmp_task_t *new_task) {
1686  kmp_int32 res;
1687  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1688 
1689 #if KMP_DEBUG || OMPT_SUPPORT
1690  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1691 #endif
1692  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1693  new_taskdata));
1694 
1695 #if OMPT_SUPPORT
1696  kmp_taskdata_t *parent = NULL;
1697  if (UNLIKELY(ompt_enabled.enabled)) {
1698  if (!new_taskdata->td_flags.started) {
1699  OMPT_STORE_RETURN_ADDRESS(gtid);
1700  parent = new_taskdata->td_parent;
1701  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1702  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1703  }
1704  if (ompt_enabled.ompt_callback_task_create) {
1705  ompt_data_t task_data = ompt_data_none;
1706  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1707  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1708  parent ? &(parent->ompt_task_info.frame) : NULL,
1709  &(new_taskdata->ompt_task_info.task_data),
1710  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1711  OMPT_LOAD_RETURN_ADDRESS(gtid));
1712  }
1713  } else {
1714  // We are scheduling the continuation of an UNTIED task.
1715  // Scheduling back to the parent task.
1716  __ompt_task_finish(new_task,
1717  new_taskdata->ompt_task_info.scheduling_parent,
1718  ompt_task_switch);
1719  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1720  }
1721  }
1722 #endif
1723 
1724  res = __kmp_omp_task(gtid, new_task, true);
1725 
1726  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1727  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1728  gtid, loc_ref, new_taskdata));
1729 #if OMPT_SUPPORT
1730  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1731  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1732  }
1733 #endif
1734  return res;
1735 }
1736 
1737 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1738 // a taskloop task with the correct OMPT return address
1739 //
1740 // loc_ref: location of original task pragma (ignored)
1741 // gtid: Global Thread ID of encountering thread
1742 // new_task: non-thread-switchable task thunk allocated by
1743 // __kmpc_omp_task_alloc()
1744 // codeptr_ra: return address for OMPT callback
1745 // Returns:
1746 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1747 // queued to be resumed later.
1748 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1749 // be resumed later.
1750 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1751  kmp_task_t *new_task, void *codeptr_ra) {
1752  kmp_int32 res;
1753  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1754 
1755 #if KMP_DEBUG || OMPT_SUPPORT
1756  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1757 #endif
1758  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1759  new_taskdata));
1760 
1761 #if OMPT_SUPPORT
1762  kmp_taskdata_t *parent = NULL;
1763  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1764  parent = new_taskdata->td_parent;
1765  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1766  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1767  if (ompt_enabled.ompt_callback_task_create) {
1768  ompt_data_t task_data = ompt_data_none;
1769  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1770  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1771  parent ? &(parent->ompt_task_info.frame) : NULL,
1772  &(new_taskdata->ompt_task_info.task_data),
1773  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1774  codeptr_ra);
1775  }
1776  }
1777 #endif
1778 
1779  res = __kmp_omp_task(gtid, new_task, true);
1780 
1781  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1782  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1783  gtid, loc_ref, new_taskdata));
1784 #if OMPT_SUPPORT
1785  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1786  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1787  }
1788 #endif
1789  return res;
1790 }
1791 
1792 template <bool ompt>
1793 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1794  void *frame_address,
1795  void *return_address) {
1796  kmp_taskdata_t *taskdata;
1797  kmp_info_t *thread;
1798  int thread_finished = FALSE;
1799  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1800 
1801  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1802 
1803  if (__kmp_tasking_mode != tskm_immediate_exec) {
1804  thread = __kmp_threads[gtid];
1805  taskdata = thread->th.th_current_task;
1806 
1807 #if OMPT_SUPPORT && OMPT_OPTIONAL
1808  ompt_data_t *my_task_data;
1809  ompt_data_t *my_parallel_data;
1810 
1811  if (ompt) {
1812  my_task_data = &(taskdata->ompt_task_info.task_data);
1813  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1814 
1815  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1816 
1817  if (ompt_enabled.ompt_callback_sync_region) {
1818  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1819  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1820  my_task_data, return_address);
1821  }
1822 
1823  if (ompt_enabled.ompt_callback_sync_region_wait) {
1824  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1825  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1826  my_task_data, return_address);
1827  }
1828  }
1829 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1830 
1831 // Debugger: The taskwait is active. Store the location and the thread that
1832 // encountered the taskwait.
1833 #if USE_ITT_BUILD
1834 // Note: These values are used by ITT events as well.
1835 #endif /* USE_ITT_BUILD */
1836  taskdata->td_taskwait_counter += 1;
1837  taskdata->td_taskwait_ident = loc_ref;
1838  taskdata->td_taskwait_thread = gtid + 1;
1839 
1840 #if USE_ITT_BUILD
1841  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1842  if (itt_sync_obj != NULL)
1843  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1844 #endif /* USE_ITT_BUILD */
1845 
1846  bool must_wait =
1847  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1848 
1849  must_wait = must_wait || (thread->th.th_task_team != NULL &&
1850  thread->th.th_task_team->tt.tt_found_proxy_tasks);
1851  if (must_wait) {
1852  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
1853  &(taskdata->td_incomplete_child_tasks)),
1854  0U);
1855  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1856  flag.execute_tasks(thread, gtid, FALSE,
1857  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1858  __kmp_task_stealing_constraint);
1859  }
1860  }
1861 #if USE_ITT_BUILD
1862  if (itt_sync_obj != NULL)
1863  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1864 #endif /* USE_ITT_BUILD */
1865 
1866  // Debugger: The taskwait is completed. Location remains, but thread is
1867  // negated.
1868  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1869 
1870 #if OMPT_SUPPORT && OMPT_OPTIONAL
1871  if (ompt) {
1872  if (ompt_enabled.ompt_callback_sync_region_wait) {
1873  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1874  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1875  my_task_data, return_address);
1876  }
1877  if (ompt_enabled.ompt_callback_sync_region) {
1878  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1879  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1880  my_task_data, return_address);
1881  }
1882  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1883  }
1884 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1885 
1886  ANNOTATE_HAPPENS_AFTER(taskdata);
1887  }
1888 
1889  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1890  "returning TASK_CURRENT_NOT_QUEUED\n",
1891  gtid, taskdata));
1892 
1893  return TASK_CURRENT_NOT_QUEUED;
1894 }
1895 
1896 #if OMPT_SUPPORT && OMPT_OPTIONAL
1897 OMPT_NOINLINE
1898 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1899  void *frame_address,
1900  void *return_address) {
1901  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1902  return_address);
1903 }
1904 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1905 
1906 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1907 // complete
1908 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1909 #if OMPT_SUPPORT && OMPT_OPTIONAL
1910  if (UNLIKELY(ompt_enabled.enabled)) {
1911  OMPT_STORE_RETURN_ADDRESS(gtid);
1912  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
1913  OMPT_LOAD_RETURN_ADDRESS(gtid));
1914  }
1915 #endif
1916  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
1917 }
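
// Illustrative lowering (assumed, not tied to a specific compiler): a bare
// "#pragma omp taskwait" becomes a single call such as
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
// where obtaining gtid via __kmpc_global_thread_num is just one possibility;
// compilers may pass an already-known gtid instead.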
1918 
1919 // __kmpc_omp_taskyield: switch to a different task
1920 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1921  kmp_taskdata_t *taskdata;
1922  kmp_info_t *thread;
1923  int thread_finished = FALSE;
1924 
1925  KMP_COUNT_BLOCK(OMP_TASKYIELD);
1926  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1927 
1928  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1929  gtid, loc_ref, end_part));
1930 
1931  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1932  thread = __kmp_threads[gtid];
1933  taskdata = thread->th.th_current_task;
1934 // Should we model this as a task wait or not?
1935 // Debugger: The taskwait is active. Store the location and the thread that
1936 // encountered the taskwait.
1937 #if USE_ITT_BUILD
1938 // Note: These values are used by ITT events as well.
1939 #endif /* USE_ITT_BUILD */
1940  taskdata->td_taskwait_counter += 1;
1941  taskdata->td_taskwait_ident = loc_ref;
1942  taskdata->td_taskwait_thread = gtid + 1;
1943 
1944 #if USE_ITT_BUILD
1945  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1946  if (itt_sync_obj != NULL)
1947  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1948 #endif /* USE_ITT_BUILD */
1949  if (!taskdata->td_flags.team_serial) {
1950  kmp_task_team_t *task_team = thread->th.th_task_team;
1951  if (task_team != NULL) {
1952  if (KMP_TASKING_ENABLED(task_team)) {
1953 #if OMPT_SUPPORT
1954  if (UNLIKELY(ompt_enabled.enabled))
1955  thread->th.ompt_thread_info.ompt_task_yielded = 1;
1956 #endif
1957  __kmp_execute_tasks_32(
1958  thread, gtid, NULL, FALSE,
1959  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1960  __kmp_task_stealing_constraint);
1961 #if OMPT_SUPPORT
1962  if (UNLIKELY(ompt_enabled.enabled))
1963  thread->th.ompt_thread_info.ompt_task_yielded = 0;
1964 #endif
1965  }
1966  }
1967  }
1968 #if USE_ITT_BUILD
1969  if (itt_sync_obj != NULL)
1970  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1971 #endif /* USE_ITT_BUILD */
1972 
1973  // Debugger: The taskwait is completed. Location remains, but thread is
1974  // negated.
1975  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1976  }
1977 
1978  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1979  "returning TASK_CURRENT_NOT_QUEUED\n",
1980  gtid, taskdata));
1981 
1982  return TASK_CURRENT_NOT_QUEUED;
1983 }
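
// Illustrative lowering (assumed): "#pragma omp taskyield" maps onto
//   __kmpc_omp_taskyield(&loc, gtid, /*end_part=*/0);
// the end_part value shown is an assumption; this routine only traces it.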
1984 
1985 // Task Reduction implementation
1986 //
1987 // Note: initial implementation didn't take into account the possibility
1988 // to specify omp_orig for initializer of the UDR (user defined reduction).
1989 // Corrected implementation takes into account the omp_orig object.
1990 // Compiler is free to use old implementation if omp_orig is not specified.
1991 
2000 typedef struct kmp_taskred_flags {
2002  unsigned lazy_priv : 1; // 1 - use lazy allocation/initialization of private copies
2003  unsigned reserved31 : 31;
2004 } kmp_taskred_flags_t;
2005 
2009 typedef struct kmp_task_red_input {
2010  void *reduce_shar; // shared between tasks item to reduce into
2011  size_t reduce_size; // size of data item in bytes
2012  // three compiler-generated routines (init, fini are optional):
2013  void *reduce_init; // data initialization routine (single parameter)
2014  void *reduce_fini; // data finalization routine
2015  void *reduce_comb; // data combiner routine
2016  kmp_taskred_flags_t flags; // flags for additional info from compiler
2017 } kmp_task_red_input_t;
2018 
2022 typedef struct kmp_taskred_data {
2023  void *reduce_shar; // shared between tasks item to reduce into
2024  size_t reduce_size; // size of data item (rounded up to a cache line)
2025  kmp_taskred_flags_t flags; // flags for additional info from compiler
2026  void *reduce_priv; // array of thread-specific items
2027  void *reduce_pend; // end of the private data area, used for range checks
2028  // three compiler-generated routines (init, fini are optional):
2029  void *reduce_comb; // data combiner routine
2030  void *reduce_init; // data initialization routine
2031  void *reduce_fini; // data finalization routine
2032  void *reduce_orig; // original item (can be used in the UDR initializer)
2033 } kmp_taskred_data_t;
2034 
2040 typedef struct kmp_taskred_input {
2041  void *reduce_shar; // shared between tasks item to reduce into
2042  void *reduce_orig; // original reduction item used for initialization
2043  size_t reduce_size; // size of data item in bytes
2044  // three compiler-generated routines (init, fini are optional):
2045  void *reduce_init; // data initialization routine (two parameters)
2046  void *reduce_fini; // data finalization routine
2047  void *reduce_comb; // data combiner routine
2048  kmp_taskred_flags_t flags; // flags for additional info from compiler
2049 } kmp_taskred_input_t;
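
// A minimal sketch of how the descriptors above might be filled in for
// "#pragma omp taskgroup task_reduction(+ : sum)" on an int. The helper names
// (red_init, red_comb) and the exact field values are assumptions for
// illustration only:
//
//   static void red_init(void *priv, void *orig) { *(int *)priv = 0; }
//   static void red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }
//
//   kmp_taskred_input_t in = {};
//   in.reduce_shar = &sum;
//   in.reduce_orig = &sum;             // original object (omp_orig)
//   in.reduce_size = sizeof(int);
//   in.reduce_init = (void *)red_init; // two-parameter form of the new interface
//   in.reduce_fini = NULL;             // nothing to finalize for an int
//   in.reduce_comb = (void *)red_comb;
//   in.flags.lazy_priv = 0;            // eagerly allocate per-thread copies
//   void *tg = __kmpc_taskred_init(gtid, 1, &in); // declared below
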
2054 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2055 template <>
2056 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2057  kmp_task_red_input_t &src) {
2058  item.reduce_orig = NULL;
2059 }
2060 template <>
2061 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2062  kmp_taskred_input_t &src) {
2063  if (src.reduce_orig != NULL) {
2064  item.reduce_orig = src.reduce_orig;
2065  } else {
2066  item.reduce_orig = src.reduce_shar;
2067  } // non-NULL reduce_orig means new interface used
2068 }
2069 
2070 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
2071 template <>
2072 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2073  int offset) {
2074  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2075 }
2076 template <>
2077 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2078  int offset) {
2079  ((void (*)(void *, void *))item.reduce_init)(
2080  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2081 }
2082 
2083 template <typename T>
2084 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2085  kmp_info_t *thread = __kmp_threads[gtid];
2086  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2087  kmp_int32 nth = thread->th.th_team_nproc;
2088  kmp_taskred_data_t *arr;
2089 
2090  // check input data just in case
2091  KMP_ASSERT(tg != NULL);
2092  KMP_ASSERT(data != NULL);
2093  KMP_ASSERT(num > 0);
2094  if (nth == 1) {
2095  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2096  gtid, tg));
2097  return (void *)tg;
2098  }
2099  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2100  gtid, tg, num));
2101  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2102  thread, num * sizeof(kmp_taskred_data_t));
2103  for (int i = 0; i < num; ++i) {
2104  size_t size = data[i].reduce_size - 1;
2105  // round the size up to cache line per thread-specific item
2106  size += CACHE_LINE - size % CACHE_LINE;
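      // For example, with CACHE_LINE == 64 and reduce_size == 100: size starts
      // at 99, 99 % 64 == 35, so 64 - 35 == 29 is added, giving 128 (the next
      // multiple of 64). Starting from reduce_size - 1 keeps sizes that are
      // already a multiple of CACHE_LINE unchanged (64 -> 63 -> 64, not 128).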
2107  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2108  arr[i].reduce_shar = data[i].reduce_shar;
2109  arr[i].reduce_size = size;
2110  arr[i].flags = data[i].flags;
2111  arr[i].reduce_comb = data[i].reduce_comb;
2112  arr[i].reduce_init = data[i].reduce_init;
2113  arr[i].reduce_fini = data[i].reduce_fini;
2114  __kmp_assign_orig<T>(arr[i], data[i]);
2115  if (!arr[i].flags.lazy_priv) {
2116  // allocate cache-line aligned block and fill it with zeros
2117  arr[i].reduce_priv = __kmp_allocate(nth * size);
2118  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2119  if (arr[i].reduce_init != NULL) {
2120  // initialize all thread-specific items
2121  for (int j = 0; j < nth; ++j) {
2122  __kmp_call_init<T>(arr[i], j * size);
2123  }
2124  }
2125  } else {
2126  // only allocate space for pointers now,
2127  // objects will be lazily allocated/initialized if/when requested
2128  // note that __kmp_allocate zeroes the allocated memory
2129  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2130  }
2131  }
2132  tg->reduce_data = (void *)arr;
2133  tg->reduce_num_data = num;
2134  return (void *)tg;
2135 }
2136 
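// __kmpc_task_reduction_init: initialize task reduction for the taskgroup of
// the calling thread, taking num items described by the older
// kmp_task_red_input_t layout (no reduce_orig); returns the taskgroup handle
// that is later passed to __kmpc_task_reduction_get_th_data.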
2151 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2152  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2153 }
2154 
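// __kmpc_taskred_init: same as __kmpc_task_reduction_init, but takes the newer
// kmp_taskred_input_t descriptors, whose reduce_orig field allows a
// user-defined reduction initializer to reference the original (omp_orig)
// object.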
2167 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2168  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2169 }
2170 
2171 // Copy task reduction data (except for shared pointers).
2172 template <typename T>
2173 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2174  kmp_taskgroup_t *tg, void *reduce_data) {
2175  kmp_taskred_data_t *arr;
2176  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2177  " from data %p\n",
2178  thr, tg, reduce_data));
2179  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2180  thr, num * sizeof(kmp_taskred_data_t));
2181  // threads will share private copies, thunk routines, sizes, flags, etc.:
2182  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2183  for (int i = 0; i < num; ++i) {
2184  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2185  }
2186  tg->reduce_data = (void *)arr;
2187  tg->reduce_num_data = num;
2188 }
2189 
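// __kmpc_task_reduction_get_th_data: given a taskgroup handle (tskgrp, or NULL
// for the current taskgroup) and the address of a reduction item (the shared
// object or any thread-specific copy), return the address of the calling
// thread's private copy, allocating and initializing it lazily if the item
// uses lazy_priv. Illustrative use inside a task body (names assumed):
//   int *p = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &sum);
//   *p += local_contribution;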
2199 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2200  kmp_info_t *thread = __kmp_threads[gtid];
2201  kmp_int32 nth = thread->th.th_team_nproc;
2202  if (nth == 1)
2203  return data; // nothing to do
2204 
2205  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2206  if (tg == NULL)
2207  tg = thread->th.th_current_task->td_taskgroup;
2208  KMP_ASSERT(tg != NULL);
2209  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2210  kmp_int32 num = tg->reduce_num_data;
2211  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2212 
2213  KMP_ASSERT(data != NULL);
2214  while (tg != NULL) {
2215  for (int i = 0; i < num; ++i) {
2216  if (!arr[i].flags.lazy_priv) {
2217  if (data == arr[i].reduce_shar ||
2218  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2219  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2220  } else {
2221  // check shared location first
2222  void **p_priv = (void **)(arr[i].reduce_priv);
2223  if (data == arr[i].reduce_shar)
2224  goto found;
2225  // check if we get some thread specific location as parameter
2226  for (int j = 0; j < nth; ++j)
2227  if (data == p_priv[j])
2228  goto found;
2229  continue; // not found, continue search
2230  found:
2231  if (p_priv[tid] == NULL) {
2232  // allocate thread specific object lazily
2233  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2234  if (arr[i].reduce_init != NULL) {
2235  if (arr[i].reduce_orig != NULL) { // new interface
2236  ((void (*)(void *, void *))arr[i].reduce_init)(
2237  p_priv[tid], arr[i].reduce_orig);
2238  } else { // old interface (single parameter)
2239  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2240  }
2241  }
2242  }
2243  return p_priv[tid];
2244  }
2245  }
2246  tg = tg->parent;
2247  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2248  num = tg->reduce_num_data;
2249  }
2250  KMP_ASSERT2(0, "Unknown task reduction item");
2251  return NULL; // ERROR, this line never executed
2252 }
2253 
2254 // Finalize task reduction.
2255 // Called from __kmpc_end_taskgroup()
2256 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2257  kmp_int32 nth = th->th.th_team_nproc;
2258  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2259  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2260  kmp_int32 num = tg->reduce_num_data;
2261  for (int i = 0; i < num; ++i) {
2262  void *sh_data = arr[i].reduce_shar;
2263  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2264  void (*f_comb)(void *, void *) =
2265  (void (*)(void *, void *))(arr[i].reduce_comb);
2266  if (!arr[i].flags.lazy_priv) {
2267  void *pr_data = arr[i].reduce_priv;
2268  size_t size = arr[i].reduce_size;
2269  for (int j = 0; j < nth; ++j) {
2270  void *priv_data = (char *)pr_data + j * size;
2271  f_comb(sh_data, priv_data); // combine results
2272  if (f_fini)
2273  f_fini(priv_data); // finalize if needed
2274  }
2275  } else {
2276  void **pr_data = (void **)(arr[i].reduce_priv);
2277  for (int j = 0; j < nth; ++j) {
2278  if (pr_data[j] != NULL) {
2279  f_comb(sh_data, pr_data[j]); // combine results
2280  if (f_fini)
2281  f_fini(pr_data[j]); // finalize if needed
2282  __kmp_free(pr_data[j]);
2283  }
2284  }
2285  }
2286  __kmp_free(arr[i].reduce_priv);
2287  }
2288  __kmp_thread_free(th, arr);
2289  tg->reduce_data = NULL;
2290  tg->reduce_num_data = 0;
2291 }
2292 
2293 // Cleanup task reduction data for parallel or worksharing,
2294 // do not touch task private data other threads still working with.
2295 // Called from __kmpc_end_taskgroup()
2296 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2297  __kmp_thread_free(th, tg->reduce_data);
2298  tg->reduce_data = NULL;
2299  tg->reduce_num_data = 0;
2300 }
2301 
2302 template <typename T>
2303 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2304  int num, T *data) {
2305  kmp_info_t *thr = __kmp_threads[gtid];
2306  kmp_int32 nth = thr->th.th_team_nproc;
2307  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2308  if (nth == 1) {
2309  KA_TRACE(10,
2310  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2311  gtid, thr->th.th_current_task->td_taskgroup));
2312  return (void *)thr->th.th_current_task->td_taskgroup;
2313  }
2314  kmp_team_t *team = thr->th.th_team;
2315  void *reduce_data;
2316  kmp_taskgroup_t *tg;
2317  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2318  if (reduce_data == NULL &&
2319  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2320  (void *)1)) {
2321  // single thread enters this block to initialize common reduction data
2322  KMP_DEBUG_ASSERT(reduce_data == NULL);
2323  // first initialize own data, then make a copy other threads can use
2324  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2325  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2326  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2327  // fini counters should be 0 at this point
2328  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2329  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2330  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2331  } else {
2332  while (
2333  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2334  (void *)1) { // wait for task reduction initialization
2335  KMP_CPU_PAUSE();
2336  }
2337  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2338  tg = thr->th.th_current_task->td_taskgroup;
2339  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2340  }
2341  return tg;
2342 }
2343 
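// __kmpc_task_reduction_modifier_init: set up task reduction for a reduction
// with the task modifier on a parallel (is_ws == 0) or worksharing
// (is_ws == 1) construct, using the older kmp_task_red_input_t descriptors.
// An implicit taskgroup is started here and ended by
// __kmpc_task_reduction_modifier_fini.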
2360 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2361  int num, void *data) {
2362  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2363  (kmp_task_red_input_t *)data);
2364 }
2365 
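// __kmpc_taskred_modifier_init: same as __kmpc_task_reduction_modifier_init,
// but takes the newer kmp_taskred_input_t descriptors (with reduce_orig).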
2380 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2381  void *data) {
2382  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2383  (kmp_taskred_input_t *)data);
2384 }
2385 
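// __kmpc_task_reduction_modifier_fini: finish a reduction with the task
// modifier by ending the taskgroup that the matching *_modifier_init started.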
2394 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2395  __kmpc_end_taskgroup(loc, gtid);
2396 }
2397 
2398 // __kmpc_taskgroup: Start a new taskgroup
2399 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2400  kmp_info_t *thread = __kmp_threads[gtid];
2401  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2402  kmp_taskgroup_t *tg_new =
2403  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2404  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2405  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2406  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2407  tg_new->parent = taskdata->td_taskgroup;
2408  tg_new->reduce_data = NULL;
2409  tg_new->reduce_num_data = 0;
2410  taskdata->td_taskgroup = tg_new;
2411 
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2414  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2415  if (!codeptr)
2416  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2417  kmp_team_t *team = thread->th.th_team;
2418  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2419  // FIXME: I think this is wrong for lwt!
2420  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2421 
2422  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2423  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2424  &(my_task_data), codeptr);
2425  }
2426 #endif
2427 }
2428 
2429 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2430 // and its descendants are complete
2431 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2432  kmp_info_t *thread = __kmp_threads[gtid];
2433  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2434  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2435  int thread_finished = FALSE;
2436 
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2438  kmp_team_t *team;
2439  ompt_data_t my_task_data;
2440  ompt_data_t my_parallel_data;
2441  void *codeptr;
2442  if (UNLIKELY(ompt_enabled.enabled)) {
2443  team = thread->th.th_team;
2444  my_task_data = taskdata->ompt_task_info.task_data;
2445  // FIXME: I think this is wrong for lwt!
2446  my_parallel_data = team->t.ompt_team_info.parallel_data;
2447  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2448  if (!codeptr)
2449  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2450  }
2451 #endif
2452 
2453  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2454  KMP_DEBUG_ASSERT(taskgroup != NULL);
2455  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2456 
2457  if (__kmp_tasking_mode != tskm_immediate_exec) {
2458  // mark task as waiting not on a barrier
2459  taskdata->td_taskwait_counter += 1;
2460  taskdata->td_taskwait_ident = loc;
2461  taskdata->td_taskwait_thread = gtid + 1;
2462 #if USE_ITT_BUILD
2463  // For ITT the taskgroup wait is similar to taskwait until we need to
2464  // distinguish them
2465  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2466  if (itt_sync_obj != NULL)
2467  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2468 #endif /* USE_ITT_BUILD */
2469 
2470 #if OMPT_SUPPORT && OMPT_OPTIONAL
2471  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2472  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2473  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2474  &(my_task_data), codeptr);
2475  }
2476 #endif
2477 
2478  if (!taskdata->td_flags.team_serial ||
2479  (thread->th.th_task_team != NULL &&
2480  thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
2481  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2482  0U);
2483  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2484  flag.execute_tasks(thread, gtid, FALSE,
2485  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2486  __kmp_task_stealing_constraint);
2487  }
2488  }
2489  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2490 
2491 #if OMPT_SUPPORT && OMPT_OPTIONAL
2492  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2493  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2494  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2495  &(my_task_data), codeptr);
2496  }
2497 #endif
2498 
2499 #if USE_ITT_BUILD
2500  if (itt_sync_obj != NULL)
2501  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2502 #endif /* USE_ITT_BUILD */
2503  }
2504  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2505 
2506  if (taskgroup->reduce_data != NULL) { // need to reduce?
2507  int cnt;
2508  void *reduce_data;
2509  kmp_team_t *t = thread->th.th_team;
2510  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2511  // check whether the <priv> data of the first reduction variable is shared for the team
2512  void *priv0 = arr[0].reduce_priv;
2513  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2514  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2515  // finishing task reduction on parallel
2516  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2517  if (cnt == thread->th.th_team_nproc - 1) {
2518  // we are the last thread passing __kmpc_reduction_modifier_fini()
2519  // finalize task reduction:
2520  __kmp_task_reduction_fini(thread, taskgroup);
2521  // cleanup fields in the team structure:
2522  // TODO: is relaxed store enough here (whole barrier should follow)?
2523  __kmp_thread_free(thread, reduce_data);
2524  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2525  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2526  } else {
2527  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2528  // so do not finalize reduction, just clean own copy of the data
2529  __kmp_task_reduction_clean(thread, taskgroup);
2530  }
2531  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2532  NULL &&
2533  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2534  // finishing task reduction on worksharing
2535  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2536  if (cnt == thread->th.th_team_nproc - 1) {
2537  // we are the last thread passing __kmpc_reduction_modifier_fini()
2538  __kmp_task_reduction_fini(thread, taskgroup);
2539  // cleanup fields in team structure:
2540  // TODO: is relaxed store enough here (whole barrier should follow)?
2541  __kmp_thread_free(thread, reduce_data);
2542  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2543  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2544  } else {
2545  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2546  // so do not finalize reduction, just clean own copy of the data
2547  __kmp_task_reduction_clean(thread, taskgroup);
2548  }
2549  } else {
2550  // finishing task reduction on taskgroup
2551  __kmp_task_reduction_fini(thread, taskgroup);
2552  }
2553  }
2554  // Restore parent taskgroup for the current task
2555  taskdata->td_taskgroup = taskgroup->parent;
2556  __kmp_thread_free(thread, taskgroup);
2557 
2558  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2559  gtid, taskdata));
2560  ANNOTATE_HAPPENS_AFTER(taskdata);
2561 
2562 #if OMPT_SUPPORT && OMPT_OPTIONAL
2563  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2564  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2565  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2566  &(my_task_data), codeptr);
2567  }
2568 #endif
2569 }
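
// Illustrative lowering (assumed): a "#pragma omp taskgroup" region is
// bracketed by the two entry points above, roughly
//   __kmpc_taskgroup(&loc, gtid);
//   ... code that creates tasks ...
//   __kmpc_end_taskgroup(&loc, gtid); // waits for all descendants of the group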
2570 
2571 // __kmp_remove_my_task: remove a task from my own deque
2572 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2573  kmp_task_team_t *task_team,
2574  kmp_int32 is_constrained) {
2575  kmp_task_t *task;
2576  kmp_taskdata_t *taskdata;
2577  kmp_thread_data_t *thread_data;
2578  kmp_uint32 tail;
2579 
2580  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2581  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2582  NULL); // Caller should check this condition
2583 
2584  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2585 
2586  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2587  gtid, thread_data->td.td_deque_ntasks,
2588  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2589 
2590  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2591  KA_TRACE(10,
2592  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2593  "ntasks=%d head=%u tail=%u\n",
2594  gtid, thread_data->td.td_deque_ntasks,
2595  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2596  return NULL;
2597  }
2598 
2599  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2600 
2601  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2602  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2603  KA_TRACE(10,
2604  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2605  "ntasks=%d head=%u tail=%u\n",
2606  gtid, thread_data->td.td_deque_ntasks,
2607  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2608  return NULL;
2609  }
2610 
2611  tail = (thread_data->td.td_deque_tail - 1) &
2612  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2613  taskdata = thread_data->td.td_deque[tail];
2614 
2615  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2616  thread->th.th_current_task)) {
2617  // The task scheduling constraint (TSC) does not allow taking the tail task
2618  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2619  KA_TRACE(10,
2620  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2621  "ntasks=%d head=%u tail=%u\n",
2622  gtid, thread_data->td.td_deque_ntasks,
2623  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2624  return NULL;
2625  }
2626 
2627  thread_data->td.td_deque_tail = tail;
2628  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2629 
2630  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2631 
2632  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2633  "ntasks=%d head=%u tail=%u\n",
2634  gtid, taskdata, thread_data->td.td_deque_ntasks,
2635  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2636 
2637  task = KMP_TASKDATA_TO_TASK(taskdata);
2638  return task;
2639 }
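
// Note: the owning thread pops from the tail (LIFO) while thieves take from
// the head (FIFO, see __kmp_steal_task below). TASK_DEQUE_MASK relies on the
// deque size being a power of two; assuming a size of 256, the mask is 0xFF,
// so decrementing a tail of 0 wraps it to 255.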
2640 
2641 // __kmp_steal_task: remove a task from another thread's deque
2642 // Assume that calling thread has already checked existence of
2643 // task_team thread_data before calling this routine.
2644 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2645  kmp_task_team_t *task_team,
2646  std::atomic<kmp_int32> *unfinished_threads,
2647  int *thread_finished,
2648  kmp_int32 is_constrained) {
2649  kmp_task_t *task;
2650  kmp_taskdata_t *taskdata;
2651  kmp_taskdata_t *current;
2652  kmp_thread_data_t *victim_td, *threads_data;
2653  kmp_int32 target;
2654  kmp_int32 victim_tid;
2655 
2656  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2657 
2658  threads_data = task_team->tt.tt_threads_data;
2659  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2660 
2661  victim_tid = victim_thr->th.th_info.ds.ds_tid;
2662  victim_td = &threads_data[victim_tid];
2663 
2664  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2665  "task_team=%p ntasks=%d head=%u tail=%u\n",
2666  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2667  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2668  victim_td->td.td_deque_tail));
2669 
2670  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2671  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2672  "task_team=%p ntasks=%d head=%u tail=%u\n",
2673  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2674  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2675  victim_td->td.td_deque_tail));
2676  return NULL;
2677  }
2678 
2679  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2680 
2681  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2682  // Check again after we acquire the lock
2683  if (ntasks == 0) {
2684  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2685  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2686  "task_team=%p ntasks=%d head=%u tail=%u\n",
2687  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2688  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2689  return NULL;
2690  }
2691 
2692  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2693  current = __kmp_threads[gtid]->th.th_current_task;
2694  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2695  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2696  // Bump head pointer and Wrap.
2697  victim_td->td.td_deque_head =
2698  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2699  } else {
2700  if (!task_team->tt.tt_untied_task_encountered) {
2701  // The task scheduling constraint (TSC) does not allow stealing the victim's task
2702  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2703  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2704  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2705  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2706  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2707  return NULL;
2708  }
2709  int i;
2710  // walk through victim's deque trying to steal any task
2711  target = victim_td->td.td_deque_head;
2712  taskdata = NULL;
2713  for (i = 1; i < ntasks; ++i) {
2714  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2715  taskdata = victim_td->td.td_deque[target];
2716  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2717  break; // found victim task
2718  } else {
2719  taskdata = NULL;
2720  }
2721  }
2722  if (taskdata == NULL) {
2723  // No appropriate candidate to steal found
2724  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2725  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2726  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2727  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2728  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2729  return NULL;
2730  }
2731  int prev = target;
2732  for (i = i + 1; i < ntasks; ++i) {
2733  // shift remaining tasks in the deque left by 1
2734  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2735  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2736  prev = target;
2737  }
2738  KMP_DEBUG_ASSERT(
2739  victim_td->td.td_deque_tail ==
2740  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2741  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2742  }
2743  if (*thread_finished) {
2744  // We need to un-mark this victim as a finished victim. This must be done
2745  // before releasing the lock, or else other threads (starting with the
2746  // master victim) might be prematurely released from the barrier!!!
2747  kmp_int32 count;
2748 
2749  count = KMP_ATOMIC_INC(unfinished_threads);
2750 
2751  KA_TRACE(
2752  20,
2753  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2754  gtid, count + 1, task_team));
2755 
2756  *thread_finished = FALSE;
2757  }
2758  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2759 
2760  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2761 
2762  KMP_COUNT_BLOCK(TASK_stolen);
2763  KA_TRACE(10,
2764  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2765  "task_team=%p ntasks=%d head=%u tail=%u\n",
2766  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2767  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2768 
2769  task = KMP_TASKDATA_TO_TASK(taskdata);
2770  return task;
2771 }
2772 
2773 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2774 // condition is satisfied (return true) or there are none left (return false).
2775 //
2776 // final_spin is TRUE if this is the spin at the release barrier.
2777 // thread_finished indicates whether the thread is finished executing all
2778 // the tasks it has on its deque, and is at the release barrier.
2779 // spinner is the location on which to spin.
2780 // spinner == NULL means only execute a single task and return.
2781 // checker is the value to check to terminate the spin.
2782 template <class C>
2783 static inline int __kmp_execute_tasks_template(
2784  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2785  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2786  kmp_int32 is_constrained) {
2787  kmp_task_team_t *task_team = thread->th.th_task_team;
2788  kmp_thread_data_t *threads_data;
2789  kmp_task_t *task;
2790  kmp_info_t *other_thread;
2791  kmp_taskdata_t *current_task = thread->th.th_current_task;
2792  std::atomic<kmp_int32> *unfinished_threads;
2793  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2794  tid = thread->th.th_info.ds.ds_tid;
2795 
2796  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2797  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2798 
2799  if (task_team == NULL || current_task == NULL)
2800  return FALSE;
2801 
2802  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2803  "*thread_finished=%d\n",
2804  gtid, final_spin, *thread_finished));
2805 
2806  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2807  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2808  KMP_DEBUG_ASSERT(threads_data != NULL);
2809 
2810  nthreads = task_team->tt.tt_nproc;
2811  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2812  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2813  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2814 
2815  while (1) { // Outer loop keeps trying to find tasks in case of single thread
2816  // getting tasks from target constructs
2817  while (1) { // Inner loop to find a task and execute it
2818  task = NULL;
2819  if (use_own_tasks) { // check on own queue first
2820  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2821  }
2822  if ((task == NULL) && (nthreads > 1)) { // Steal a task
2823  int asleep = 1;
2824  use_own_tasks = 0;
2825  // Try to steal from the last place I stole from successfully.
2826  if (victim_tid == -2) { // haven't stolen anything yet
2827  victim_tid = threads_data[tid].td.td_deque_last_stolen;
2828  if (victim_tid !=
2829  -1) // if we have a last stolen from victim, get the thread
2830  other_thread = threads_data[victim_tid].td.td_thr;
2831  }
2832  if (victim_tid != -1) { // found last victim
2833  asleep = 0;
2834  } else if (!new_victim) { // no recent steals and we haven't already
2835  // used a new victim; select a random thread
2836  do { // Find a different thread to steal work from.
2837  // Pick a random thread. Initial plan was to cycle through all the
2838  // threads, and only return if we tried to steal from every thread,
2839  // and failed. Arch says that's not such a great idea.
2840  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2841  if (victim_tid >= tid) {
2842  ++victim_tid; // Adjusts random distribution to exclude self
2843  }
2844  // Found a potential victim
2845  other_thread = threads_data[victim_tid].td.td_thr;
2846  // There is a slight chance that __kmp_enable_tasking() did not wake
2847  // up all threads waiting at the barrier. If victim is sleeping,
2848  // then wake it up. Since we were going to pay the cache miss
2849  // penalty for referencing another thread's kmp_info_t struct
2850  // anyway,
2851  // the check shouldn't cost too much performance at this point. In
2852  // extra barrier mode, tasks do not sleep at the separate tasking
2853  // barrier, so this isn't a problem.
2854  asleep = 0;
2855  if ((__kmp_tasking_mode == tskm_task_teams) &&
2856  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2857  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2858  NULL)) {
2859  asleep = 1;
2860  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2861  other_thread->th.th_sleep_loc);
2862  // A sleeping thread should not have any tasks on its queue.
2863  // There is a slight possibility that it resumes, steals a task
2864  // from another thread, which spawns more tasks, all in the time
2865  // that it takes this thread to check => don't write an assertion
2866  // that the victim's queue is empty. Try stealing from a
2867  // different thread.
2868  }
2869  } while (asleep);
2870  }
2871 
2872  if (!asleep) {
2873  // We have a victim to try to steal from
2874  task = __kmp_steal_task(other_thread, gtid, task_team,
2875  unfinished_threads, thread_finished,
2876  is_constrained);
2877  }
2878  if (task != NULL) { // set last stolen to victim
2879  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2880  threads_data[tid].td.td_deque_last_stolen = victim_tid;
2881  // The pre-refactored code did not try more than 1 successful new
2882  // victim, unless the last one generated more local tasks;
2883  // new_victim keeps track of this
2884  new_victim = 1;
2885  }
2886  } else { // No tasks found; unset last_stolen
2887  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2888  victim_tid = -2; // no successful victim found
2889  }
2890  }
2891 
2892  if (task == NULL) // break out of tasking loop
2893  break;
2894 
2895 // Found a task; execute it
2896 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2897  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2898  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2899  // get the object reliably
2900  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2901  }
2902  __kmp_itt_task_starting(itt_sync_obj);
2903  }
2904 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2905  __kmp_invoke_task(gtid, task, current_task);
2906 #if USE_ITT_BUILD
2907  if (itt_sync_obj != NULL)
2908  __kmp_itt_task_finished(itt_sync_obj);
2909 #endif /* USE_ITT_BUILD */
2910  // If this thread is only partway through the barrier and the condition is
2911  // met, then return now, so that the barrier gather/release pattern can
2912  // proceed. If this thread is in the last spin loop in the barrier,
2913  // waiting to be released, we know that the termination condition will not
2914  // be satisfied, so don't waste any cycles checking it.
2915  if (flag == NULL || (!final_spin && flag->done_check())) {
2916  KA_TRACE(
2917  15,
2918  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2919  gtid));
2920  return TRUE;
2921  }
2922  if (thread->th.th_task_team == NULL) {
2923  break;
2924  }
2925  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
2926  // If execution of a stolen task results in more tasks being placed on our
2927  // run queue, reset use_own_tasks
2928  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2929  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2930  "other tasks, restart\n",
2931  gtid));
2932  use_own_tasks = 1;
2933  new_victim = 0;
2934  }
2935  }
2936 
2937  // The task source has been exhausted. If in final spin loop of barrier,
2938  // check if termination condition is satisfied. The work queue may be empty
2939  // but there might be proxy tasks still executing.
2940  if (final_spin &&
2941  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
2942  // First, decrement the #unfinished threads, if that has not already been
2943  // done. This decrement might be to the spin location, and result in the
2944  // termination condition being satisfied.
2945  if (!*thread_finished) {
2946  kmp_int32 count;
2947 
2948  count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
2949  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2950  "unfinished_threads to %d task_team=%p\n",
2951  gtid, count, task_team));
2952  *thread_finished = TRUE;
2953  }
2954 
2955  // It is now unsafe to reference thread->th.th_team !!!
2956  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2957  // thread to pass through the barrier, where it might reset each thread's
2958  // th.th_team field for the next parallel region. If we can steal more
2959  // work, we know that this has not happened yet.
2960  if (flag != NULL && flag->done_check()) {
2961  KA_TRACE(
2962  15,
2963  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2964  gtid));
2965  return TRUE;
2966  }
2967  }
2968 
2969  // If this thread's task team is NULL, master has recognized that there are
2970  // no more tasks; bail out
2971  if (thread->th.th_task_team == NULL) {
2972  KA_TRACE(15,
2973  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2974  return FALSE;
2975  }
2976 
2977  // We could be getting tasks from target constructs; if this is the only
2978  // thread, keep trying to execute tasks from own queue
2979  if (nthreads == 1)
2980  use_own_tasks = 1;
2981  else {
2982  KA_TRACE(15,
2983  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2984  return FALSE;
2985  }
2986  }
2987 }
2988 
2989 int __kmp_execute_tasks_32(
2990  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2991  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2992  kmp_int32 is_constrained) {
2993  return __kmp_execute_tasks_template(
2994  thread, gtid, flag, final_spin,
2995  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2996 }
2997 
2998 int __kmp_execute_tasks_64(
2999  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
3000  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3001  kmp_int32 is_constrained) {
3002  return __kmp_execute_tasks_template(
3003  thread, gtid, flag, final_spin,
3004  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3005 }
3006 
3007 int __kmp_execute_tasks_oncore(
3008  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3009  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3010  kmp_int32 is_constrained) {
3011  return __kmp_execute_tasks_template(
3012  thread, gtid, flag, final_spin,
3013  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3014 }
3015 
3016 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3017 // next barrier so they can assist in executing enqueued tasks.
3018 // First thread in allocates the task team atomically.
3019 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3020  kmp_info_t *this_thr) {
3021  kmp_thread_data_t *threads_data;
3022  int nthreads, i, is_init_thread;
3023 
3024  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3025  __kmp_gtid_from_thread(this_thr)));
3026 
3027  KMP_DEBUG_ASSERT(task_team != NULL);
3028  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3029 
3030  nthreads = task_team->tt.tt_nproc;
3031  KMP_DEBUG_ASSERT(nthreads > 0);
3032  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3033 
3034  // Allocate or increase the size of threads_data if necessary
3035  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3036 
3037  if (!is_init_thread) {
3038  // Some other thread already set up the array.
3039  KA_TRACE(
3040  20,
3041  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3042  __kmp_gtid_from_thread(this_thr)));
3043  return;
3044  }
3045  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3046  KMP_DEBUG_ASSERT(threads_data != NULL);
3047 
3048  if (__kmp_tasking_mode == tskm_task_teams &&
3049  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3050  // Release any threads sleeping at the barrier, so that they can steal
3051  // tasks and execute them. In extra barrier mode, threads do not sleep
3052  // at the separate tasking barrier, so this isn't a problem.
3053  for (i = 0; i < nthreads; i++) {
3054  volatile void *sleep_loc;
3055  kmp_info_t *thread = threads_data[i].td.td_thr;
3056 
3057  if (i == this_thr->th.th_info.ds.ds_tid) {
3058  continue;
3059  }
3060  // Since we haven't locked the thread's suspend mutex lock at this
3061  // point, there is a small window where a thread might be putting
3062  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3063  // To work around this, __kmp_execute_tasks_template() periodically checks
3064  // to see if other threads are sleeping (using the same random mechanism
3065  // that is used for task stealing) and awakens them if they are.
3066  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3067  NULL) {
3068  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3069  __kmp_gtid_from_thread(this_thr),
3070  __kmp_gtid_from_thread(thread)));
3071  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3072  } else {
3073  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3074  __kmp_gtid_from_thread(this_thr),
3075  __kmp_gtid_from_thread(thread)));
3076  }
3077  }
3078  }
3079 
3080  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3081  __kmp_gtid_from_thread(this_thr)));
3082 }
3083 
3084 /*
3085  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind
3086  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
3087  * After a child thread checks into a barrier and calls __kmp_release() from
3088  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3089  * longer assume that the kmp_team_t structure is intact (at any moment, the
3090  * master thread may exit the barrier code, free the team data structure,
3091  * and return the threads to the thread pool).
3092  *
3093  * This does not work with the tasking code, as the thread is still
3094  * expected to participate in the execution of any tasks that may have been
3095  * spawned by a member of the team, and the thread still needs access to
3096  * each of the other threads in the team, so that it can steal work from them.
3097  *
3098  * Enter the kmp_task_team_t struct. It employs a reference-counting
3099  * mechanism, and is allocated by the master thread before calling
3100  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3101  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3102  * of the kmp_task_team_t structs for consecutive barriers can overlap
3103  * (and will, unless the master thread is the last thread to exit the barrier
3104  * release phase, which is not typical). Such a struct could also be useful
3105  * outside the context of tasking.
3106  *
3107  * We currently use the existence of the threads array as an indicator that
3108  * tasks were spawned since the last barrier. If the structure is to be
3109  * useful outside the context of tasking, then this will have to change, but
3110  * not setting the field minimizes the performance impact of tasking on
3111  * barriers, when no explicit tasks were spawned (pushed, actually).
3112  */
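// Illustrative sketch (not part of the original source): the reference
// counting mentioned above amounts to an atomic count of the threads that
// have not yet finished with the task team (tt_unfinished_threads). A minimal
// stand-alone model of that pattern, using std::atomic directly instead of
// the kmp_* atomic wrappers:
#if 0
#include <atomic>

struct toy_task_team {
  std::atomic<int> unfinished_threads;
};

// Master: arm the counter before the release phase.
static void toy_task_team_setup(toy_task_team *tt, int nproc) {
  tt->unfinished_threads.store(nproc, std::memory_order_release);
}

// Worker: called once it runs out of tasks in its final spin; returns true
// for the thread whose decrement drove the counter to zero, i.e. the point
// at which the struct may be recycled.
static bool toy_thread_finished(toy_task_team *tt) {
  int remaining =
      tt->unfinished_threads.fetch_sub(1, std::memory_order_acq_rel) - 1;
  return remaining == 0;
}
#endif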
3113 
3114 static kmp_task_team_t *__kmp_free_task_teams =
3115  NULL; // Free list for task_team data structures
3116 // Lock for task team data structures
3117 kmp_bootstrap_lock_t __kmp_task_team_lock =
3118  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3119 
3120 // __kmp_alloc_task_deque:
3121 // Allocates a task deque for a particular thread, and initializes the necessary
3122 // data structures relating to the deque. This only happens once per thread
3123 // per task team since task teams are recycled. No lock is needed during
3124 // allocation since each thread allocates its own deque.
3125 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3126  kmp_thread_data_t *thread_data) {
3127  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3128  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3129 
3130  // Initialize last stolen task field to "none"
3131  thread_data->td.td_deque_last_stolen = -1;
3132 
3133  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3134  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3135  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3136 
3137  KE_TRACE(
3138  10,
3139  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3140  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3141  // Allocate space for task deque, and zero the deque
3142  // Cannot use __kmp_thread_calloc() because threads not around for
3143  // kmp_reap_task_team( ).
3144  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3145  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3146  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3147 }
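// Illustrative sketch (not part of the original source): the deque allocated
// above is used as a power-of-two ring buffer, so head/tail indices wrap with
// a mask rather than a modulo (compare the TASK_DEQUE_MASK arithmetic used by
// the push/steal paths). A minimal stand-alone version of that indexing
// scheme:
#if 0
#include <cstdint>

struct toy_deque {
  void **slots;    // array of 'size' task pointers
  uint32_t size;   // always a power of two
  uint32_t head;   // index tasks are stolen from
  uint32_t tail;   // index tasks are pushed at
  uint32_t ntasks; // current number of queued tasks
};

static bool toy_push(toy_deque *d, void *task) {
  if (d->ntasks >= d->size)
    return false;                          // full: caller must grow or bail
  d->slots[d->tail] = task;
  d->tail = (d->tail + 1) & (d->size - 1); // wrap via mask
  d->ntasks++;
  return true;
}

static void *toy_steal(toy_deque *d) {
  if (d->ntasks == 0)
    return nullptr;
  void *task = d->slots[d->head];
  d->head = (d->head + 1) & (d->size - 1); // wrap via mask
  d->ntasks--;
  return task;
}
#endif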
3148 
3149 // __kmp_free_task_deque:
3150 // Deallocates a task deque for a particular thread. Happens at library
3151 // deallocation, so there is no need to reset all thread data fields.
3152 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3153  if (thread_data->td.td_deque != NULL) {
3154  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3155  TCW_4(thread_data->td.td_deque_ntasks, 0);
3156  __kmp_free(thread_data->td.td_deque);
3157  thread_data->td.td_deque = NULL;
3158  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3159  }
3160 
3161 #ifdef BUILD_TIED_TASK_STACK
3162  // GEH: Figure out what to do here for td_susp_tied_tasks
3163  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3164  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3165  }
3166 #endif // BUILD_TIED_TASK_STACK
3167 }
3168 
3169 // __kmp_realloc_task_threads_data:
3170 // Allocates a threads_data array for a task team, either by allocating an
3171 // initial array or enlarging an existing array. Only the first thread to get
3172 // the lock allocates or enlarges the array and re-initializes its elements.
3173 // That thread returns "TRUE", the rest return "FALSE".
3174 // Assumes that the new array size is given by task_team->tt.tt_nproc.
3175 // The current size is given by task_team->tt.tt_max_threads.
3176 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3177  kmp_task_team_t *task_team) {
3178  kmp_thread_data_t **threads_data_p;
3179  kmp_int32 nthreads, maxthreads;
3180  int is_init_thread = FALSE;
3181 
3182  if (TCR_4(task_team->tt.tt_found_tasks)) {
3183  // Already reallocated and initialized.
3184  return FALSE;
3185  }
3186 
3187  threads_data_p = &task_team->tt.tt_threads_data;
3188  nthreads = task_team->tt.tt_nproc;
3189  maxthreads = task_team->tt.tt_max_threads;
3190 
3191  // All threads must lock when they encounter the first task of the implicit
3192  // task region to make sure threads_data fields are (re)initialized before
3193  // used.
3194  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3195 
3196  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3197  // first thread to enable tasking
3198  kmp_team_t *team = thread->th.th_team;
3199  int i;
3200 
3201  is_init_thread = TRUE;
3202  if (maxthreads < nthreads) {
3203 
3204  if (*threads_data_p != NULL) {
3205  kmp_thread_data_t *old_data = *threads_data_p;
3206  kmp_thread_data_t *new_data = NULL;
3207 
3208  KE_TRACE(
3209  10,
3210  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3211  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3212  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3213  // Reallocate threads_data to have more elements than current array
3214  // Cannot use __kmp_thread_realloc() because threads not around for
3215  // kmp_reap_task_team( ). Note all new array entries are initialized
3216  // to zero by __kmp_allocate().
3217  new_data = (kmp_thread_data_t *)__kmp_allocate(
3218  nthreads * sizeof(kmp_thread_data_t));
3219  // copy old data to new data
3220  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3221  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3222 
3223 #ifdef BUILD_TIED_TASK_STACK
3224  // GEH: Figure out if this is the right thing to do
3225  for (i = maxthreads; i < nthreads; i++) {
3226  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3227  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3228  }
3229 #endif // BUILD_TIED_TASK_STACK
3230  // Install the new data and free the old data
3231  (*threads_data_p) = new_data;
3232  __kmp_free(old_data);
3233  } else {
3234  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3235  "threads data for task_team %p, size = %d\n",
3236  __kmp_gtid_from_thread(thread), task_team, nthreads));
3237  // Make the initial allocation for the threads_data array, and zero entries
3238  // Cannot use __kmp_thread_calloc() because threads not around for
3239  // kmp_reap_task_team( ).
3240  ANNOTATE_IGNORE_WRITES_BEGIN();
3241  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3242  nthreads * sizeof(kmp_thread_data_t));
3243  ANNOTATE_IGNORE_WRITES_END();
3244 #ifdef BUILD_TIED_TASK_STACK
3245  // GEH: Figure out if this is the right thing to do
3246  for (i = 0; i < nthreads; i++) {
3247  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3248  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3249  }
3250 #endif // BUILD_TIED_TASK_STACK
3251  }
3252  task_team->tt.tt_max_threads = nthreads;
3253  } else {
3254  // If array has (more than) enough elements, go ahead and use it
3255  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3256  }
3257 
3258  // initialize threads_data pointers back to thread_info structures
3259  for (i = 0; i < nthreads; i++) {
3260  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3261  thread_data->td.td_thr = team->t.t_threads[i];
3262 
3263  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3264  // The last stolen field survives across teams / barrier, and the number
3265  // of threads may have changed. It's possible (likely?) that a new
3266  // parallel region will exhibit the same behavior as the previous one.
3267  thread_data->td.td_deque_last_stolen = -1;
3268  }
3269  }
3270 
3271  KMP_MB();
3272  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3273  }
3274 
3275  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3276  return is_init_thread;
3277 }
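// Illustrative sketch (not part of the original source): the routine above is
// an instance of the check / lock / re-check idiom -- every thread tests
// tt_found_tasks without the lock, and only the first one to acquire the lock
// while the flag is still unset performs the (re)allocation. A generic model
// of that idiom:
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> initialized{false};
static std::mutex init_lock;

// Returns true only for the single caller that actually performed the work,
// mirroring the TRUE/FALSE contract of __kmp_realloc_task_threads_data().
static bool init_once(void (*do_init)()) {
  if (initialized.load(std::memory_order_acquire))
    return false;                      // fast path: already set up
  std::lock_guard<std::mutex> guard(init_lock);
  if (initialized.load(std::memory_order_relaxed))
    return false;                      // another thread won the race
  do_init();
  initialized.store(true, std::memory_order_release);
  return true;
}
#endif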
3278 
3279 // __kmp_free_task_threads_data:
3280 // Deallocates a threads_data array for a task team, including any attached
3281 // tasking deques. Only occurs at library shutdown.
3282 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3283  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3284  if (task_team->tt.tt_threads_data != NULL) {
3285  int i;
3286  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3287  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3288  }
3289  __kmp_free(task_team->tt.tt_threads_data);
3290  task_team->tt.tt_threads_data = NULL;
3291  }
3292  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3293 }
3294 
3295 // __kmp_allocate_task_team:
3296 // Allocates a task team associated with a specific team, taking it from
3297 // the global task team free list if possible. Also initializes data
3298 // structures.
3299 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3300  kmp_team_t *team) {
3301  kmp_task_team_t *task_team = NULL;
3302  int nthreads;
3303 
3304  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3305  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3306 
3307  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3308  // Take a task team from the task team pool
3309  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3310  if (__kmp_free_task_teams != NULL) {
3311  task_team = __kmp_free_task_teams;
3312  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3313  task_team->tt.tt_next = NULL;
3314  }
3315  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3316  }
3317 
3318  if (task_team == NULL) {
3319  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3320  "task team for team %p\n",
3321  __kmp_gtid_from_thread(thread), team));
3322  // Allocate a new task team if one is not available.
3323  // Cannot use __kmp_thread_malloc() because threads not around for
3324  // kmp_reap_task_team( ).
3325  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3326  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3327  // AC: __kmp_allocate zeroes returned memory
3328  // task_team -> tt.tt_threads_data = NULL;
3329  // task_team -> tt.tt_max_threads = 0;
3330  // task_team -> tt.tt_next = NULL;
3331  }
3332 
3333  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3334  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3335  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3336 
3337  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3338  TCW_4(task_team->tt.tt_active, TRUE);
3339 
3340  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3341  "unfinished_threads init'd to %d\n",
3342  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3343  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3344  return task_team;
3345 }
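// Illustrative sketch (not part of the original source): the recycling above
// is a simple LIFO free list of task teams, linked through tt_next and
// guarded by __kmp_task_team_lock. A generic model of the pop/push pair used
// by __kmp_allocate_task_team() and __kmp_free_task_team():
#if 0
struct toy_node {
  toy_node *next;
};

static toy_node *toy_free_list = nullptr; // caller holds the list lock

static toy_node *toy_pop() {
  toy_node *n = toy_free_list;
  if (n != nullptr) {
    toy_free_list = n->next;
    n->next = nullptr;
  }
  return n;
}

static void toy_push(toy_node *n) {
  n->next = toy_free_list;
  toy_free_list = n;
}
#endif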
3346 
3347 // __kmp_free_task_team:
3348 // Frees the task team associated with a specific thread, and adds it
3349 // to the global task team free list.
3350 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3351  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3352  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3353 
3354  // Put task team back on free list
3355  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3356 
3357  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3358  task_team->tt.tt_next = __kmp_free_task_teams;
3359  TCW_PTR(__kmp_free_task_teams, task_team);
3360 
3361  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3362 }
3363 
3364 // __kmp_reap_task_teams:
3365 // Free all the task teams on the task team free list.
3366 // Should only be done during library shutdown.
3367 // Cannot do anything that needs a thread structure or gtid since they are
3368 // already gone.
3369 void __kmp_reap_task_teams(void) {
3370  kmp_task_team_t *task_team;
3371 
3372  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3373  // Free all task_teams on the free list
3374  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3375  while ((task_team = __kmp_free_task_teams) != NULL) {
3376  __kmp_free_task_teams = task_team->tt.tt_next;
3377  task_team->tt.tt_next = NULL;
3378 
3379  // Free threads_data if necessary
3380  if (task_team->tt.tt_threads_data != NULL) {
3381  __kmp_free_task_threads_data(task_team);
3382  }
3383  __kmp_free(task_team);
3384  }
3385  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3386  }
3387 }
3388 
3389 // __kmp_wait_to_unref_task_teams:
3390 // Some threads could still be in the fork barrier release code, possibly
3391 // trying to steal tasks. Wait for each thread to unreference its task team.
3392 void __kmp_wait_to_unref_task_teams(void) {
3393  kmp_info_t *thread;
3394  kmp_uint32 spins;
3395  int done;
3396 
3397  KMP_INIT_YIELD(spins);
3398 
3399  for (;;) {
3400  done = TRUE;
3401 
3402  // TODO: GEH - this may be wrong because some sync would be necessary
3403  // in case threads are added to the pool during the traversal. Need to
3404  // verify that lock for thread pool is held when calling this routine.
3405  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3406  thread = thread->th.th_next_pool) {
3407 #if KMP_OS_WINDOWS
3408  DWORD exit_val;
3409 #endif
3410  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3411  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3412  __kmp_gtid_from_thread(thread)));
3413  continue;
3414  }
3415 #if KMP_OS_WINDOWS
3416  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3417  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3418  thread->th.th_task_team = NULL;
3419  continue;
3420  }
3421 #endif
3422 
3423  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3424 
3425  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3426  "unreference task_team\n",
3427  __kmp_gtid_from_thread(thread)));
3428 
3429  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3430  volatile void *sleep_loc;
3431  // If the thread is sleeping, awaken it.
3432  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3433  NULL) {
3434  KA_TRACE(
3435  10,
3436  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3437  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3438  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3439  }
3440  }
3441  }
3442  if (done) {
3443  break;
3444  }
3445 
3446  // If oversubscribed or have waited a bit, yield.
3447  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3448  }
3449 }
3450 
3451 // __kmp_task_team_setup: Create a task_team for the current team, but use
3452 // an already created, unused one if it already exists.
3453 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3454  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3455 
3456  // If this task_team hasn't been created yet, allocate it. It will be used in
3457  // the region after the next.
3458  // If it exists, it is the current task team and shouldn't be touched yet as
3459  // it may still be in use.
3460  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3461  (always || team->t.t_nproc > 1)) {
3462  team->t.t_task_team[this_thr->th.th_task_state] =
3463  __kmp_allocate_task_team(this_thr, team);
3464  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3465  "for team %d at parity=%d\n",
3466  __kmp_gtid_from_thread(this_thr),
3467  team->t.t_task_team[this_thr->th.th_task_state],
3468  ((team != NULL) ? team->t.t_id : -1),
3469  this_thr->th.th_task_state));
3470  }
3471 
3472  // After threads exit the release, they will call sync, and then point to this
3473  // other task_team; make sure it is allocated and properly initialized. As
3474  // threads spin in the barrier release phase, they will continue to use the
3475  // previous task_team struct (above), until they receive the signal to stop
3476  // checking for tasks (they can't safely reference the kmp_team_t struct,
3477  // which could be reallocated by the master thread). No task teams are formed
3478  // for serialized teams.
3479  if (team->t.t_nproc > 1) {
3480  int other_team = 1 - this_thr->th.th_task_state;
3481  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3482  team->t.t_task_team[other_team] =
3483  __kmp_allocate_task_team(this_thr, team);
3484  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3485  "task_team %p for team %d at parity=%d\n",
3486  __kmp_gtid_from_thread(this_thr),
3487  team->t.t_task_team[other_team],
3488  ((team != NULL) ? team->t.t_id : -1), other_team));
3489  } else { // Leave the old task team struct in place for the upcoming region;
3490  // adjust as needed
3491  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3492  if (!task_team->tt.tt_active ||
3493  team->t.t_nproc != task_team->tt.tt_nproc) {
3494  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3495  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3496  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3497  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3498  team->t.t_nproc);
3499  TCW_4(task_team->tt.tt_active, TRUE);
3500  }
3501  // if team size has changed, the first thread to enable tasking will
3502  // realloc threads_data if necessary
3503  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3504  "%p for team %d at parity=%d\n",
3505  __kmp_gtid_from_thread(this_thr),
3506  team->t.t_task_team[other_team],
3507  ((team != NULL) ? team->t.t_id : -1), other_team));
3508  }
3509  }
3510 }
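// Illustrative sketch (not part of the original source): the "other team"
// logic above keeps two task_team slots per team, indexed by a 0/1 parity bit
// (th_task_state) that each thread flips at every barrier (see
// __kmp_task_team_sync() below). With simplified stand-in types:
#if 0
struct toy_task_team;
struct toy_team {
  toy_task_team *task_team[2]; // one slot per barrier parity
};
struct toy_thread {
  int task_state;           // 0 or 1: which slot this thread currently uses
  toy_task_team *task_team; // cached pointer to the active slot
};

// Master, at setup time: make sure the slot for the *next* parity exists,
// while workers may still be draining the slot for the current parity.
static void toy_setup_other(toy_thread *master, toy_team *team,
                            toy_task_team *(*alloc)(void)) {
  int other = 1 - master->task_state;
  if (team->task_team[other] == nullptr)
    team->task_team[other] = alloc();
}
#endif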
3511 
3512 // __kmp_task_team_sync: Propagation of task team data from team to threads
3513 // which happens just after the release phase of a team barrier. This may be
3514 // called by any thread, but only for teams with # threads > 1.
3515 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3516  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3517 
3518  // Toggle the th_task_state field, to switch which task_team this thread
3519  // refers to
3520  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3521  // It is now safe to propagate the task team pointer from the team struct to
3522  // the current thread.
3523  TCW_PTR(this_thr->th.th_task_team,
3524  team->t.t_task_team[this_thr->th.th_task_state]);
3525  KA_TRACE(20,
3526  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3527  "%p from Team #%d (parity=%d)\n",
3528  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3529  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
3530 }
3531 
3532 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
3533 // barrier gather phase. Only called by master thread if #threads in team > 1 or
3534 // if proxy tasks were created.
3535 //
3536 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3537 // by passing in 0 optionally as the last argument. When wait is zero, master
3538 // thread does not wait for unfinished_threads to reach 0.
3539 void __kmp_task_team_wait(
3540  kmp_info_t *this_thr,
3541  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3542  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3543 
3544  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3545  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3546 
3547  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3548  if (wait) {
3549  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3550  "(for unfinished_threads to reach 0) on task_team = %p\n",
3551  __kmp_gtid_from_thread(this_thr), task_team));
3552  // Worker threads may have dropped through to release phase, but could
3553  // still be executing tasks. Wait here for tasks to complete. To avoid
3554  // memory contention, only master thread checks termination condition.
3555  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3556  &task_team->tt.tt_unfinished_threads),
3557  0U);
3558  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3559  }
3560  // Deactivate the old task team, so that the worker threads will stop
3561  // referencing it while spinning.
3562  KA_TRACE(
3563  20,
3564  ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3565  "setting active to false, setting local and team's pointer to NULL\n",
3566  __kmp_gtid_from_thread(this_thr), task_team));
3567  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3568  task_team->tt.tt_found_proxy_tasks == TRUE);
3569  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3570  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3571  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3572  KMP_MB();
3573 
3574  TCW_PTR(this_thr->th.th_task_team, NULL);
3575  }
3576 }
3577 
3578 // __kmp_tasking_barrier:
3579 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
3580 // Internal function to execute all tasks prior to a regular barrier or a join
3581 // barrier. It is a full barrier itself, which unfortunately turns regular
3582 // barriers into double barriers and join barriers into 1 1/2 barriers.
3583 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3584  std::atomic<kmp_uint32> *spin = RCAST(
3585  std::atomic<kmp_uint32> *,
3586  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3587  int flag = FALSE;
3588  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3589 
3590 #if USE_ITT_BUILD
3591  KMP_FSYNC_SPIN_INIT(spin, NULL);
3592 #endif /* USE_ITT_BUILD */
3593  kmp_flag_32 spin_flag(spin, 0U);
3594  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3595  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3596 #if USE_ITT_BUILD
3597  // TODO: What about itt_sync_obj??
3598  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3599 #endif /* USE_ITT_BUILD */
3600 
3601  if (TCR_4(__kmp_global.g.g_done)) {
3602  if (__kmp_global.g.g_abort)
3603  __kmp_abort_thread();
3604  break;
3605  }
3606  KMP_YIELD(TRUE);
3607  }
3608 #if USE_ITT_BUILD
3609  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3610 #endif /* USE_ITT_BUILD */
3611 }
3612 
3613 // __kmp_give_task puts a task into a given thread queue if:
3614 // - the queue for that thread was created
3615 // - there's space in that queue
3616 // Because of this, __kmp_push_task needs to check if there's space after
3617 // getting the lock
3618 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3619  kmp_int32 pass) {
3620  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3621  kmp_task_team_t *task_team = taskdata->td_task_team;
3622 
3623  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3624  taskdata, tid));
3625 
3626  // If task_team is NULL something went really bad...
3627  KMP_DEBUG_ASSERT(task_team != NULL);
3628 
3629  bool result = false;
3630  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3631 
3632  if (thread_data->td.td_deque == NULL) {
3633  // There's no queue in this thread, go find another one
3634  // We're guaranteed that at least one thread has a queue
3635  KA_TRACE(30,
3636  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3637  tid, taskdata));
3638  return result;
3639  }
3640 
3641  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3642  TASK_DEQUE_SIZE(thread_data->td)) {
3643  KA_TRACE(
3644  30,
3645  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3646  taskdata, tid));
3647 
3648  // if this deque is bigger than the pass ratio give a chance to another
3649  // thread
3650  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3651  return result;
3652 
3653  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3654  __kmp_realloc_task_deque(thread, thread_data);
3655 
3656  } else {
3657 
3658  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3659 
3660  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3661  TASK_DEQUE_SIZE(thread_data->td)) {
3662  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3663  "thread %d.\n",
3664  taskdata, tid));
3665 
3666  // if this deque is bigger than the pass ratio give a chance to another
3667  // thread
3668  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3669  goto release_and_exit;
3670 
3671  __kmp_realloc_task_deque(thread, thread_data);
3672  }
3673  }
3674 
3675  // lock is held here, and there is space in the deque
3676 
3677  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3678  // Wrap index.
3679  thread_data->td.td_deque_tail =
3680  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3681  TCW_4(thread_data->td.td_deque_ntasks,
3682  TCR_4(thread_data->td.td_deque_ntasks) + 1);
3683 
3684  result = true;
3685  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3686  taskdata, tid));
3687 
3688 release_and_exit:
3689  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3690 
3691  return result;
3692 }
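// Worked example (illustrative, not part of the original source): because the
// deque sizes are powers of two, the "pass ratio" tests above skip a thread
// whose deque has already been grown large relative to the initial size. For
// instance, if INITIAL_TASK_DEQUE_SIZE were 256 and a thread's full deque had
// grown to 1024, the ratio is 4, so the task is passed over for that thread
// as long as pass <= 4; only once 'pass' -- doubled after every full sweep of
// the team in __kmpc_proxy_task_completed_ooo() -- exceeds that ratio does
// the thread accept the task by growing its deque again.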
3693 
3694 /* The finish of the proxy tasks is divided in two pieces:
3695  - the top half is the one that can be done from a thread outside the team
3696  - the bottom half must be run from a thread within the team
3697 
3698  In order to run the bottom half the task gets queued back into one of the
3699  threads of the team. Once the td_incomplete_child_tasks counter of the parent
3700  is decremented, the threads can leave the barriers. So, the bottom half needs
3701  to be queued before the counter is decremented. The top half is therefore
3702  divided in two parts:
3703  - things that can be run before queuing the bottom half
3704  - things that must be run after queuing the bottom half
3705 
3706  This creates a second race as the bottom half can free the task before the
3707  second top half is executed. To avoid this we use the
3708  td_incomplete_child_tasks counter of the proxy task to synchronize the top
3709  and bottom half. */
3710 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3711  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3712  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3713  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3714  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3715 
3716  taskdata->td_flags.complete = 1; // mark the task as completed
3717 
3718  if (taskdata->td_taskgroup)
3719  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3720 
3721  // Create an imaginary child for this task so the bottom half cannot
3722  // release the task before we have completed the second top half
3723  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3724 }
3725 
3726 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3727  kmp_int32 children = 0;
3728 
3729  // Predecrement simulated by "- 1" calculation
3730  children =
3731  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3732  KMP_DEBUG_ASSERT(children >= 0);
3733 
3734  // Remove the imaginary child
3735  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3736 }
3737 
3738 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3739  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3740  kmp_info_t *thread = __kmp_threads[gtid];
3741 
3742  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3743  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3744  1); // top half must run before bottom half
3745 
3746  // We need to wait to make sure the top half is finished
3747  // Spinning here should be ok as this should happen quickly
3748  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3749  ;
3750 
3751  __kmp_release_deps(gtid, taskdata);
3752  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3753 }
3754 
3763 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3764  KMP_DEBUG_ASSERT(ptask != NULL);
3765  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3766  KA_TRACE(
3767  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3768  gtid, taskdata));
3769 
3770  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3771 
3772  __kmp_first_top_half_finish_proxy(taskdata);
3773  __kmp_second_top_half_finish_proxy(taskdata);
3774  __kmp_bottom_half_finish_proxy(gtid, ptask);
3775 
3776  KA_TRACE(10,
3777  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3778  gtid, taskdata));
3779 }
3780 
3788 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3789  KMP_DEBUG_ASSERT(ptask != NULL);
3790  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3791 
3792  KA_TRACE(
3793  10,
3794  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3795  taskdata));
3796 
3797  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3798 
3799  __kmp_first_top_half_finish_proxy(taskdata);
3800 
3801  // Enqueue the task so that the bottom half of the completion runs on a
3802  // thread within the corresponding team
3803  kmp_team_t *team = taskdata->td_team;
3804  kmp_int32 nthreads = team->t.t_nproc;
3805  kmp_info_t *thread;
3806 
3807  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3808  // but we cannot use __kmp_get_random here
3809  kmp_int32 start_k = 0;
3810  kmp_int32 pass = 1;
3811  kmp_int32 k = start_k;
3812 
3813  do {
3814  // For now we're just linearly trying to find a thread
3815  thread = team->t.t_threads[k];
3816  k = (k + 1) % nthreads;
3817 
3818  // we did a full pass through all the threads
3819  if (k == start_k)
3820  pass = pass << 1;
3821 
3822  } while (!__kmp_give_task(thread, k, ptask, pass));
3823 
3824  __kmp_second_top_half_finish_proxy(taskdata);
3825 
3826  KA_TRACE(
3827  10,
3828  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3829  taskdata));
3830 }
3831 
3832 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
3833  kmp_task_t *task) {
3834  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
3835  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
3836  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
3837  td->td_allow_completion_event.ed.task = task;
3838  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
3839  }
3840  return &td->td_allow_completion_event;
3841 }
3842 
3843 void __kmp_fulfill_event(kmp_event_t *event) {
3844  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
3845  kmp_task_t *ptask = event->ed.task;
3846  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3847  bool detached = false;
3848  int gtid = __kmp_get_gtid();
3849 
3850  if (taskdata->td_flags.proxy == TASK_PROXY) {
3851  // The associated task code completed before this call and detached.
3852  detached = true;
3853  event->type = KMP_EVENT_UNINITIALIZED;
3854  } else {
3855  // The associated task has not completed but could be completing at this
3856  // point.
3857  // We need to take the lock to avoid races
3858  __kmp_acquire_tas_lock(&event->lock, gtid);
3859  if (taskdata->td_flags.proxy == TASK_PROXY)
3860  detached = true;
3861  event->type = KMP_EVENT_UNINITIALIZED;
3862  __kmp_release_tas_lock(&event->lock, gtid);
3863  }
3864 
3865  if (detached) {
3866  // If the task detached, complete the proxy task
3867  if (gtid >= 0) {
3868  kmp_team_t *team = taskdata->td_team;
3869  kmp_info_t *thread = __kmp_get_thread();
3870  if (thread->th.th_team == team) {
3871  __kmpc_proxy_task_completed(gtid, ptask);
3872  return;
3873  }
3874  }
3875 
3876  // fallback
3877  __kmpc_proxy_task_completed_ooo(ptask);
3878  }
3879  }
3880 }
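// Illustrative sketch (not part of the original source): the event machinery
// above backs the OpenMP 5.0 'detach' clause. From user code it is typically
// driven as below; omp_event_handle_t and omp_fulfill_event() are the
// standard API, while the asynchronous completion callback is an assumed
// example.
#if 0
#include <omp.h>

static omp_event_handle_t g_done_event; // fulfilled from the completion callback

// Assumed callback invoked by some asynchronous engine when the work is done.
static void io_completion_callback(void) {
  omp_fulfill_event(g_done_event); // lets the detached task finally complete
}

static void example(void) {
#pragma omp parallel
#pragma omp single
  {
#pragma omp task detach(g_done_event)
    {
      // Start asynchronous work and return; the task body finishes here, but
      // the task itself stays incomplete until g_done_event is fulfilled.
    }
#pragma omp taskwait // does not return until io_completion_callback() has run
  }
}
#endif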
3881 
3882 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3883 // for taskloop
3884 //
3885 // thread: allocating thread
3886 // task_src: pointer to source task to be duplicated
3887 // returns: a pointer to the allocated kmp_task_t structure (task).
3888 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3889  kmp_task_t *task;
3890  kmp_taskdata_t *taskdata;
3891  kmp_taskdata_t *taskdata_src;
3892  kmp_taskdata_t *parent_task = thread->th.th_current_task;
3893  size_t shareds_offset;
3894  size_t task_size;
3895 
3896  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3897  task_src));
3898  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3899  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3900  TASK_FULL); // it should not be proxy task
3901  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3902  task_size = taskdata_src->td_size_alloc;
3903 
3904  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3905  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3906  task_size));
3907 #if USE_FAST_MEMORY
3908  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3909 #else
3910  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3911 #endif /* USE_FAST_MEMORY */
3912  KMP_MEMCPY(taskdata, taskdata_src, task_size);
3913 
3914  task = KMP_TASKDATA_TO_TASK(taskdata);
3915 
3916  // Initialize new task (only specific fields not affected by memcpy)
3917  taskdata->td_task_id = KMP_GEN_TASK_ID();
3918  if (task->shareds != NULL) { // need to set up the shareds pointer
3919  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3920  task->shareds = &((char *)taskdata)[shareds_offset];
3921  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3922  0);
3923  }
3924  taskdata->td_alloc_thread = thread;
3925  taskdata->td_parent = parent_task;
3926  taskdata->td_taskgroup =
3927  parent_task
3928  ->td_taskgroup; // task inherits the taskgroup from the parent task
3929 
3930  // Only need to keep track of child task counts if team parallel and tasking
3931  // not serialized
3932  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3933  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
3934  if (parent_task->td_taskgroup)
3935  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
3936  // Only need to keep track of allocated child tasks for explicit tasks,
3937  // since implicit tasks are not deallocated
3938  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3939  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
3940  }
3941 
3942  KA_TRACE(20,
3943  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3944  thread, taskdata, taskdata->td_parent));
3945 #if OMPT_SUPPORT
3946  if (UNLIKELY(ompt_enabled.enabled))
3947  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
3948 #endif
3949  return task;
3950 }
3951 
3952 // Routine optionally generated by the compiler for setting the lastprivate flag
3953 // and calling needed constructors for private/firstprivate objects
3954 // (used to form taskloop tasks from pattern task)
3955 // Parameters: dest task, src task, lastprivate flag.
3956 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
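// Illustrative sketch (not part of the original source): an assumed shape for
// a compiler-generated routine matching the p_task_dup_t signature above. The
// struct layout and field names are hypothetical; the real routine is emitted
// by the compiler to match the taskloop body it created.
#if 0
struct hypothetical_privates {
  int first; // firstprivate: copy-initialized from the pattern task
  int last;  // lastprivate: written back from the chunk flagged 'lastpriv'
};

static void hypothetical_task_dup(kmp_task_t *dst, kmp_task_t *src,
                                  kmp_int32 lastpriv) {
  // Hypothetical layout: privates assumed to follow the kmp_task_t header.
  hypothetical_privates *d = (hypothetical_privates *)(dst + 1);
  hypothetical_privates *s = (hypothetical_privates *)(src + 1);
  d->first = s->first; // construct/copy firstprivates for this chunk
  if (lastpriv) {
    // record that this chunk must perform the lastprivate write-back
  }
}
#endif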
3957 
3958 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
3959 
3960 // class to encapsulate manipulating loop bounds in a taskloop task.
3961 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
3962 // the loop bound variables.
3963 class kmp_taskloop_bounds_t {
3964  kmp_task_t *task;
3965  const kmp_taskdata_t *taskdata;
3966  size_t lower_offset;
3967  size_t upper_offset;
3968 
3969 public:
3970  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
3971  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
3972  lower_offset((char *)lb - (char *)task),
3973  upper_offset((char *)ub - (char *)task) {
3974  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
3975  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
3976  }
3977  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
3978  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
3979  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
3980  size_t get_lower_offset() const { return lower_offset; }
3981  size_t get_upper_offset() const { return upper_offset; }
3982  kmp_uint64 get_lb() const {
3983  kmp_int64 retval;
3984 #if defined(KMP_GOMP_COMPAT)
3985  // Intel task just returns the lower bound normally
3986  if (!taskdata->td_flags.native) {
3987  retval = *(kmp_int64 *)((char *)task + lower_offset);
3988  } else {
3989  // GOMP task has to take into account the sizeof(long)
3990  if (taskdata->td_size_loop_bounds == 4) {
3991  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
3992  retval = (kmp_int64)*lb;
3993  } else {
3994  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
3995  retval = (kmp_int64)*lb;
3996  }
3997  }
3998 #else
3999  retval = *(kmp_int64 *)((char *)task + lower_offset);
4000 #endif // defined(KMP_GOMP_COMPAT)
4001  return retval;
4002  }
4003  kmp_uint64 get_ub() const {
4004  kmp_int64 retval;
4005 #if defined(KMP_GOMP_COMPAT)
4006  // Intel task just returns the upper bound normally
4007  if (!taskdata->td_flags.native) {
4008  retval = *(kmp_int64 *)((char *)task + upper_offset);
4009  } else {
4010  // GOMP task has to take into account the sizeof(long)
4011  if (taskdata->td_size_loop_bounds == 4) {
4012  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4013  retval = (kmp_int64)*ub;
4014  } else {
4015  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4016  retval = (kmp_int64)*ub;
4017  }
4018  }
4019 #else
4020  retval = *(kmp_int64 *)((char *)task + upper_offset);
4021 #endif // defined(KMP_GOMP_COMPAT)
4022  return retval;
4023  }
4024  void set_lb(kmp_uint64 lb) {
4025 #if defined(KMP_GOMP_COMPAT)
4026  // Intel task just sets the lower bound normally
4027  if (!taskdata->td_flags.native) {
4028  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4029  } else {
4030  // GOMP task has to take into account the sizeof(long)
4031  if (taskdata->td_size_loop_bounds == 4) {
4032  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4033  *lower = (kmp_uint32)lb;
4034  } else {
4035  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4036  *lower = (kmp_uint64)lb;
4037  }
4038  }
4039 #else
4040  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4041 #endif // defined(KMP_GOMP_COMPAT)
4042  }
4043  void set_ub(kmp_uint64 ub) {
4044 #if defined(KMP_GOMP_COMPAT)
4045  // Intel task just sets the upper bound normally
4046  if (!taskdata->td_flags.native) {
4047  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4048  } else {
4049  // GOMP task has to take into account the sizeof(long)
4050  if (taskdata->td_size_loop_bounds == 4) {
4051  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4052  *upper = (kmp_uint32)ub;
4053  } else {
4054  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4055  *upper = (kmp_uint64)ub;
4056  }
4057  }
4058 #else
4059  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4060 #endif // defined(KMP_GOMP_COMPAT)
4061  }
4062 };
4063 
4064 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4065 //
4066 // loc Source location information
4067 // gtid Global thread ID
4068 // task Pattern task, exposes the loop iteration range
4069 // lb Pointer to loop lower bound in task structure
4070 // ub Pointer to loop upper bound in task structure
4071 // st Loop stride
4072 // ub_glob Global upper bound (used for lastprivate check)
4073 // num_tasks Number of tasks to execute
4074 // grainsize Number of loop iterations per task
4075 // extras Number of chunks with grainsize+1 iterations
4076 // tc Iterations count
4077 // task_dup Tasks duplication routine
4078 // codeptr_ra Return address for OMPT events
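// Worked example (illustrative, not part of the original comment): with
// tc = 10 iterations split into num_tasks = 3 tasks, the caller passes
// grainsize = 10 / 3 = 3 and extras = 10 % 3 = 1, so the chunks handed out
// below are 4, 3 and 3 iterations -- the first 'extras' tasks get
// grainsize + 1 iterations -- and tc == num_tasks * grainsize + extras holds,
// as asserted at the top of the function.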
4079 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4080  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4081  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4082  kmp_uint64 grainsize, kmp_uint64 extras,
4083  kmp_uint64 tc,
4084 #if OMPT_SUPPORT
4085  void *codeptr_ra,
4086 #endif
4087  void *task_dup) {
4088  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4089  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4090  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4091  // compiler provides global bounds here
4092  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4093  kmp_uint64 lower = task_bounds.get_lb();
4094  kmp_uint64 upper = task_bounds.get_ub();
4095  kmp_uint64 i;
4096  kmp_info_t *thread = __kmp_threads[gtid];
4097  kmp_taskdata_t *current_task = thread->th.th_current_task;
4098  kmp_task_t *next_task;
4099  kmp_int32 lastpriv = 0;
4100 
4101  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4102  KMP_DEBUG_ASSERT(num_tasks > extras);
4103  KMP_DEBUG_ASSERT(num_tasks > 0);
4104  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4105  "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4106  gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
4107  task_dup));
4108 
4109  // Launch num_tasks tasks, assign grainsize iterations each task
4110  for (i = 0; i < num_tasks; ++i) {
4111  kmp_uint64 chunk_minus_1;
4112  if (extras == 0) {
4113  chunk_minus_1 = grainsize - 1;
4114  } else {
4115  chunk_minus_1 = grainsize;
4116  --extras; // first extras iterations get bigger chunk (grainsize+1)
4117  }
4118  upper = lower + st * chunk_minus_1;
4119  if (i == num_tasks - 1) {
4120  // schedule the last task, set lastprivate flag if needed
4121  if (st == 1) { // most common case
4122  KMP_DEBUG_ASSERT(upper == *ub);
4123  if (upper == ub_glob)
4124  lastpriv = 1;
4125  } else if (st > 0) { // positive loop stride
4126  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4127  if ((kmp_uint64)st > ub_glob - upper)
4128  lastpriv = 1;
4129  } else { // negative loop stride
4130  KMP_DEBUG_ASSERT(upper + st < *ub);
4131  if (upper - ub_glob < (kmp_uint64)(-st))
4132  lastpriv = 1;
4133  }
4134  }
4135  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4136  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4137  kmp_taskloop_bounds_t next_task_bounds =
4138  kmp_taskloop_bounds_t(next_task, task_bounds);
4139 
4140  // adjust task-specific bounds
4141  next_task_bounds.set_lb(lower);
4142  if (next_taskdata->td_flags.native) {
4143  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4144  } else {
4145  next_task_bounds.set_ub(upper);
4146  }
4147  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4148  // etc.
4149  ptask_dup(next_task, task, lastpriv);
4150  KA_TRACE(40,
4151  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4152  "upper %lld stride %lld, (offsets %p %p)\n",
4153  gtid, i, next_task, lower, upper, st,
4154  next_task_bounds.get_lower_offset(),
4155  next_task_bounds.get_upper_offset()));
4156 #if OMPT_SUPPORT
4157  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4158  codeptr_ra); // schedule new task
4159 #else
4160  __kmp_omp_task(gtid, next_task, true); // schedule new task
4161 #endif
4162  lower = upper + st; // adjust lower bound for the next iteration
4163  }
4164  // free the pattern task and exit
4165  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4166  // do not execute the pattern task, just do internal bookkeeping
4167  __kmp_task_finish<false>(gtid, task, current_task);
4168 }
4169 
4170 // Structure to keep taskloop parameters for auxiliary task
4171 // kept in the shareds of the task structure.
4172 typedef struct __taskloop_params {
4173  kmp_task_t *task;
4174  kmp_uint64 *lb;
4175  kmp_uint64 *ub;
4176  void *task_dup;
4177  kmp_int64 st;
4178  kmp_uint64 ub_glob;
4179  kmp_uint64 num_tasks;
4180  kmp_uint64 grainsize;
4181  kmp_uint64 extras;
4182  kmp_uint64 tc;
4183  kmp_uint64 num_t_min;
4184 #if OMPT_SUPPORT
4185  void *codeptr_ra;
4186 #endif
4187 } __taskloop_params_t;
4188 
4189 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4190  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4191  kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
4192 #if OMPT_SUPPORT
4193  void *,
4194 #endif
4195  void *);
4196 
4197 // Execute part of the taskloop submitted as a task.
4198 int __kmp_taskloop_task(int gtid, void *ptask) {
4199  __taskloop_params_t *p =
4200  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4201  kmp_task_t *task = p->task;
4202  kmp_uint64 *lb = p->lb;
4203  kmp_uint64 *ub = p->ub;
4204  void *task_dup = p->task_dup;
4205  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4206  kmp_int64 st = p->st;
4207  kmp_uint64 ub_glob = p->ub_glob;
4208  kmp_uint64 num_tasks = p->num_tasks;
4209  kmp_uint64 grainsize = p->grainsize;
4210  kmp_uint64 extras = p->extras;
4211  kmp_uint64 tc = p->tc;
4212  kmp_uint64 num_t_min = p->num_t_min;
4213 #if OMPT_SUPPORT
4214  void *codeptr_ra = p->codeptr_ra;
4215 #endif
4216 #if KMP_DEBUG
4217  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4218  KMP_DEBUG_ASSERT(task != NULL);
4219  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4220  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4221  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4222  task_dup));
4223 #endif
4224  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4225  if (num_tasks > num_t_min)
4226  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4227  grainsize, extras, tc, num_t_min,
4228 #if OMPT_SUPPORT
4229  codeptr_ra,
4230 #endif
4231  task_dup);
4232  else
4233  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4234  grainsize, extras, tc,
4235 #if OMPT_SUPPORT
4236  codeptr_ra,
4237 #endif
4238  task_dup);
4239 
4240  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4241  return 0;
4242 }
4243 
4244 // Schedule part of the taskloop as a task,
4245 // execute the rest of the taskloop.
4246 //
4247 // loc Source location information
4248 // gtid Global thread ID
4249 // task Pattern task, exposes the loop iteration range
4250 // lb Pointer to loop lower bound in task structure
4251 // ub Pointer to loop upper bound in task structure
4252 // st Loop stride
4253 // ub_glob Global upper bound (used for lastprivate check)
4254 // num_tasks Number of tasks to execute
4255 // grainsize Number of loop iterations per task
4256 // extras Number of chunks with grainsize+1 iterations
4257 // tc Iterations count
4258 // num_t_min Threshold to launch tasks recursively
4259 // task_dup Tasks duplication routine
4260 // codeptr_ra Return address for OMPT events
4261 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4262  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4263  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4264  kmp_uint64 grainsize, kmp_uint64 extras,
4265  kmp_uint64 tc, kmp_uint64 num_t_min,
4266 #if OMPT_SUPPORT
4267  void *codeptr_ra,
4268 #endif
4269  void *task_dup) {
4270 #if KMP_DEBUG
4271  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4272  KMP_DEBUG_ASSERT(task != NULL);
4273  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4274  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4275  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4276  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4277  task_dup));
4278 #endif
4279  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4280  kmp_uint64 lower = *lb;
4281  kmp_info_t *thread = __kmp_threads[gtid];
4282  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4283  kmp_task_t *next_task;
4284  size_t lower_offset =
4285  (char *)lb - (char *)task; // remember offset of lb in the task structure
4286  size_t upper_offset =
4287  (char *)ub - (char *)task; // remember offset of ub in the task structure
4288 
4289  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4290  KMP_DEBUG_ASSERT(num_tasks > extras);
4291  KMP_DEBUG_ASSERT(num_tasks > 0);
4292 
4293  // split the loop in two halves
4294  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4295  kmp_uint64 gr_size0 = grainsize;
4296  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4297  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4298  if (n_tsk0 <= extras) {
4299  gr_size0++; // integrate extras into grainsize
4300  ext0 = 0; // no extra iters in 1st half
4301  ext1 = extras - n_tsk0; // remaining extras
4302  tc0 = gr_size0 * n_tsk0;
4303  tc1 = tc - tc0;
4304  } else { // n_tsk0 > extras
4305  ext1 = 0; // no extra iters in 2nd half
4306  ext0 = extras;
4307  tc1 = grainsize * n_tsk1;
4308  tc0 = tc - tc1;
4309  }
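  // Worked example (illustrative, not part of the original source): with
  // num_tasks = 7, grainsize = 3, extras = 2 (tc = 23), the split gives
  // n_tsk0 = 3 and n_tsk1 = 4; since n_tsk0 > extras, the first half keeps
  // both extra iterations (ext0 = 2, tc0 = 11) and the second half gets
  // tc1 = 3 * 4 = 12. If instead n_tsk0 <= extras, the first half's grainsize
  // is bumped by one and the remaining extras move to the second half.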
4310  ub0 = lower + st * (tc0 - 1);
4311  lb1 = ub0 + st;
4312 
4313  // create pattern task for 2nd half of the loop
4314  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4315  // adjust lower bound (upper bound is not changed) for the 2nd half
4316  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4317  if (ptask_dup != NULL) // construct firstprivates, etc.
4318  ptask_dup(next_task, task, 0);
4319  *ub = ub0; // adjust upper bound for the 1st half
4320 
4321  // create auxiliary task for 2nd half of the loop
4322  kmp_task_t *new_task =
4323  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4324  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4325  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4326  p->task = next_task;
4327  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4328  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4329  p->task_dup = task_dup;
4330  p->st = st;
4331  p->ub_glob = ub_glob;
4332  p->num_tasks = n_tsk1;
4333  p->grainsize = grainsize;
4334  p->extras = ext1;
4335  p->tc = tc1;
4336  p->num_t_min = num_t_min;
4337 #if OMPT_SUPPORT
4338  p->codeptr_ra = codeptr_ra;
4339 #endif
4340 
4341 #if OMPT_SUPPORT
4342  // schedule new task with correct return address for OMPT events
4343  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4344 #else
4345  __kmp_omp_task(gtid, new_task, true); // schedule new task
4346 #endif
4347 
4348  // execute the 1st half of current subrange
4349  if (n_tsk0 > num_t_min)
4350  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4351  ext0, tc0, num_t_min,
4352 #if OMPT_SUPPORT
4353  codeptr_ra,
4354 #endif
4355  task_dup);
4356  else
4357  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4358  gr_size0, ext0, tc0,
4359 #if OMPT_SUPPORT
4360  codeptr_ra,
4361 #endif
4362  task_dup);
4363 
4364  KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
4365 }
4366 
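// __kmpc_taskloop: execute the taskloop construct
//
// loc: source location information
// gtid: global thread identifier
// task: the pattern task produced by the compiler
// if_val: value of the if clause (0 serializes execution)
// lb, ub: pointers to the loop lower/upper bounds kept in the task structure
// st: loop stride
// nogroup: 1 if the nogroup clause was specified, 0 otherwise
// sched: 0/1/2 for no schedule clause / grainsize / num_tasks specified
// grainsize: the grainsize or num_tasks value, depending on sched
// task_dup: task duplication routine provided by the compiler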
4383 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4384  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4385  int sched, kmp_uint64 grainsize, void *task_dup) {
4386  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4387  KMP_DEBUG_ASSERT(task != NULL);
4388 
4389  if (nogroup == 0) {
4390 #if OMPT_SUPPORT && OMPT_OPTIONAL
4391  OMPT_STORE_RETURN_ADDRESS(gtid);
4392 #endif
4393  __kmpc_taskgroup(loc, gtid);
4394  }
4395 
4396  // =========================================================================
4397  // calculate loop parameters
4398  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4399  kmp_uint64 tc;
4400  // compiler provides global bounds here
4401  kmp_uint64 lower = task_bounds.get_lb();
4402  kmp_uint64 upper = task_bounds.get_ub();
4403  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4404  kmp_uint64 num_tasks = 0, extras = 0;
4405  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4406  kmp_info_t *thread = __kmp_threads[gtid];
4407  kmp_taskdata_t *current_task = thread->th.th_current_task;
4408 
4409  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4410  "grain %llu(%d), dup %p\n",
4411  gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4412 
4413  // compute trip count
4414  if (st == 1) { // most common case
4415  tc = upper - lower + 1;
4416  } else if (st < 0) {
4417  tc = (lower - upper) / (-st) + 1;
4418  } else { // st > 0
4419  tc = (upper - lower) / st + 1;
4420  }
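  // For illustration: lb = 0, ub = 9, st = 2 gives tc = (9 - 0) / 2 + 1 = 5
  // iterations (0,2,4,6,8); lb = 10, ub = 1, st = -3 gives
  // tc = (10 - 1) / 3 + 1 = 4 iterations (10,7,4,1).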
4421  if (tc == 0) {
4422  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4423  // free the pattern task and exit
4424  __kmp_task_start(gtid, task, current_task);
4425  // do not execute anything for zero-trip loop
4426  __kmp_task_finish<false>(gtid, task, current_task);
4427  return;
4428  }
4429 
4430 #if OMPT_SUPPORT && OMPT_OPTIONAL
4431  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4432  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4433  if (ompt_enabled.ompt_callback_work) {
4434  ompt_callbacks.ompt_callback(ompt_callback_work)(
4435  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4436  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4437  }
4438 #endif
4439 
4440  if (num_tasks_min == 0)
4441  // TODO: can we choose a better default heuristic?
4442  num_tasks_min =
4443  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
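  // e.g. an 8-thread team defaults to min(80, INITIAL_TASK_DEQUE_SIZE):
  // up to that many tasks are spawned linearly, more triggers recursive splitting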
4444 
4445  // compute num_tasks/grainsize based on the input provided
4446  switch (sched) {
4447  case 0: // no schedule clause specified, we can choose the default
4448  // let's try to schedule (team_size*10) tasks
4449  grainsize = thread->th.th_team_nproc * 10;
4450  KMP_FALLTHROUGH();
4451  case 2: // num_tasks provided
4452  if (grainsize > tc) {
4453  num_tasks = tc; // too big num_tasks requested, adjust values
4454  grainsize = 1;
4455  extras = 0;
4456  } else {
4457  num_tasks = grainsize;
4458  grainsize = tc / num_tasks;
4459  extras = tc % num_tasks;
4460  }
4461  break;
4462  case 1: // grainsize provided
4463  if (grainsize > tc) {
4464  num_tasks = 1; // too big grainsize requested, adjust values
4465  grainsize = tc;
4466  extras = 0;
4467  } else {
4468  num_tasks = tc / grainsize;
4469  // adjust grainsize for balanced distribution of iterations
4470  grainsize = tc / num_tasks;
4471  extras = tc % num_tasks;
4472  }
4473  break;
4474  default:
4475  KMP_ASSERT2(0, "unknown scheduling of taskloop");
4476  }
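  // Worked example for tc = 10: grainsize(4) (sched == 1) yields
  // num_tasks = 10 / 4 = 2, grainsize rebalanced to 10 / 2 = 5, extras = 0,
  // i.e. two tasks of 5 iterations each. num_tasks(4) (sched == 2, the value
  // arrives in the grainsize argument) yields num_tasks = 4,
  // grainsize = 10 / 4 = 2, extras = 2, i.e. two tasks of 3 iterations and
  // two of 2.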
4477  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4478  KMP_DEBUG_ASSERT(num_tasks > extras);
4479  KMP_DEBUG_ASSERT(num_tasks > 0);
4480  // =========================================================================
4481 
4482  // check the value of the if clause first
4483  // GOMP_taskloop (taskdata->td_flags.native) must also reduce to linear spawning
4484  if (if_val == 0) { // if(0) specified, mark task as serial
4485  taskdata->td_flags.task_serial = 1;
4486  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4487  // always start serial tasks linearly
4488  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4489  grainsize, extras, tc,
4490 #if OMPT_SUPPORT
4491  OMPT_GET_RETURN_ADDRESS(0),
4492 #endif
4493  task_dup);
4494  // !taskdata->td_flags.native => linear spawning of tasks is currently
4495  // forced for GOMP_taskloop
4496  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4497  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4498  "(%lld), grain %llu, extras %llu\n",
4499  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4500  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4501  grainsize, extras, tc, num_tasks_min,
4502 #if OMPT_SUPPORT
4503  OMPT_GET_RETURN_ADDRESS(0),
4504 #endif
4505  task_dup);
4506  } else {
4507  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4508  "(%lld), grain %llu, extras %llu\n",
4509  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4510  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4511  grainsize, extras, tc,
4512 #if OMPT_SUPPORT
4513  OMPT_GET_RETURN_ADDRESS(0),
4514 #endif
4515  task_dup);
4516  }
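  // For illustration: tc = 1000 with grainsize(5) yields 200 tasks; on an
  // 8-thread team (num_tasks_min = 80 by default) 200 > 80, so the range is
  // split recursively until each piece is at most 80 tasks, which are then
  // spawned linearly.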
4517 
4518 #if OMPT_SUPPORT && OMPT_OPTIONAL
4519  if (ompt_enabled.ompt_callback_work) {
4520  ompt_callbacks.ompt_callback(ompt_callback_work)(
4521  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4522  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4523  }
4524 #endif
4525 
4526  if (nogroup == 0) {
4527 #if OMPT_SUPPORT && OMPT_OPTIONAL
4528  OMPT_STORE_RETURN_ADDRESS(gtid);
4529 #endif
4530  __kmpc_end_taskgroup(loc, gtid);
4531  }
4532  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4533 }
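// For reference (illustrative sketch, not part of the runtime source): a
// compiler typically lowers
//
//   #pragma omp taskloop grainsize(100)
//   for (long i = 0; i < n; ++i)
//     body(i);
//
// into a pattern task allocated via __kmpc_omp_task_alloc followed by a call
// to __kmpc_taskloop with sched = 1 and grainsize = 100; num_tasks(N) would be
// passed as sched = 2 with the value in the grainsize argument.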