16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
21 #include "ompt-specific.h"
24 #include "tsan_annotations.h"
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28 kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30 kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32 kmp_task_team_t *task_team);
33 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
35 #ifdef BUILD_TIED_TASK_STACK
44 static void __kmp_trace_task_stack(kmp_int32 gtid,
45 kmp_thread_data_t *thread_data,
46 int threshold, char *location) {
47 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
48 kmp_taskdata_t **stack_top = task_stack->ts_top;
49 kmp_int32 entries = task_stack->ts_entries;
50 kmp_taskdata_t *tied_task;
54 (
"__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
55 "first_block = %p, stack_top = %p \n",
56 location, gtid, entries, task_stack->ts_first_block, stack_top));
58 KMP_DEBUG_ASSERT(stack_top != NULL);
59 KMP_DEBUG_ASSERT(entries > 0);
61 while (entries != 0) {
62 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
64 if ((entries & TASK_STACK_INDEX_MASK) == 0) {
65 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
67 stack_block = stack_block->sb_prev;
68 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
75 tied_task = *stack_top;
77 KMP_DEBUG_ASSERT(tied_task != NULL);
78 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
81 (
"__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
82 "stack_top=%p, tied_task=%p\n",
83 location, gtid, entries, stack_top, tied_task));
85 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
88 (
"__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
98 static void __kmp_init_task_stack(kmp_int32 gtid,
99 kmp_thread_data_t *thread_data) {
100 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
101 kmp_stack_block_t *first_block;
104 first_block = &task_stack->ts_first_block;
105 task_stack->ts_top = (kmp_taskdata_t **)first_block;
106 memset((void *)first_block, '\0',
107 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
110 task_stack->ts_entries = TASK_STACK_EMPTY;
111 first_block->sb_next = NULL;
112 first_block->sb_prev = NULL;
119 static void __kmp_free_task_stack(kmp_int32 gtid,
120 kmp_thread_data_t *thread_data) {
121 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
122 kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
124 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
126 while (stack_block != NULL) {
127 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
129 stack_block->sb_next = NULL;
130 stack_block->sb_prev = NULL;
131 if (stack_block != &task_stack->ts_first_block) {
132 __kmp_thread_free(thread,
135 stack_block = next_block;
138 task_stack->ts_entries = 0;
139 task_stack->ts_top = NULL;
148 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
149 kmp_taskdata_t *tied_task) {
151 kmp_thread_data_t *thread_data =
152 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
153 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
155 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
159 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
160 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
163 (
"__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
164 gtid, thread, tied_task));
166 *(task_stack->ts_top) = tied_task;
169 task_stack->ts_top++;
170 task_stack->ts_entries++;
172 if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
174 kmp_stack_block_t *stack_block =
175 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
178 if (stack_block->sb_next !=
180 task_stack->ts_top = &stack_block->sb_next->sb_block[0];
182 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
183 thread,
sizeof(kmp_stack_block_t));
185 task_stack->ts_top = &new_block->sb_block[0];
186 stack_block->sb_next = new_block;
187 new_block->sb_prev = stack_block;
188 new_block->sb_next = NULL;
192 (
"__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
193 gtid, tied_task, new_block));
196 KA_TRACE(20, (
"__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
208 kmp_taskdata_t *ending_task) {
210 kmp_thread_data_t *thread_data =
211 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
212 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
213 kmp_taskdata_t *tied_task;
215 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
220 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
221 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
223 KA_TRACE(20, (
"__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
227 if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
228 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
230 stack_block = stack_block->sb_prev;
231 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
235 task_stack->ts_top--;
236 task_stack->ts_entries--;
238 tied_task = *(task_stack->ts_top);
240 KMP_DEBUG_ASSERT(tied_task != NULL);
241 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
242 KMP_DEBUG_ASSERT(tied_task == ending_task);
244 KA_TRACE(20, (
"__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
253 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
254 const kmp_taskdata_t *tasknew,
255 const kmp_taskdata_t *taskcurr) {
256 if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
260 kmp_taskdata_t *current = taskcurr->td_last_tied;
261 KMP_DEBUG_ASSERT(current != NULL);
263 if (current->td_flags.tasktype == TASK_EXPLICIT ||
264 current->td_taskwait_thread > 0) {
265 kmp_int32 level = current->td_level;
266 kmp_taskdata_t *parent = tasknew->td_parent;
267 while (parent != current && parent->td_level > level) {
269 parent = parent->td_parent;
270 KMP_DEBUG_ASSERT(parent != NULL);
272 if (parent != current)
277 kmp_depnode_t *node = tasknew->td_depnode;
278 if (node && (node->dn.mtx_num_locks > 0)) {
279 for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
280 KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
281 if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
284 for (int j = i - 1; j >= 0; --j)
285 __kmp_release_lock(node->dn.mtx_locks[j], gtid);
289 node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
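// __kmp_realloc_task_deque: double the capacity of a thread's task deque and
// copy the live entries from head to tail into the new buffer; the caller is
// expected to hold td_deque_lock.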
298 static void __kmp_realloc_task_deque(kmp_info_t *thread,
299 kmp_thread_data_t *thread_data) {
300 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
301 kmp_int32 new_size = 2 * size;
303 KE_TRACE(10, (
"__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
304 "%d] for thread_data %p\n",
305 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
307 kmp_taskdata_t **new_deque =
308 (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
311 for (i = thread_data->td.td_deque_head, j = 0; j < size;
312 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
313 new_deque[j] = thread_data->td.td_deque[i];
315 __kmp_free(thread_data->td.td_deque);
317 thread_data->td.td_deque_head = 0;
318 thread_data->td.td_deque_tail = size;
319 thread_data->td.td_deque = new_deque;
320 thread_data->td.td_deque_size = new_size;
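// __kmp_push_task: try to queue a task on the encountering thread's deque.
// Returns TASK_NOT_PUSHED when the task must instead be executed immediately
// (serialized task, or a full deque with throttling enabled), otherwise
// enqueues it at the tail and returns TASK_SUCCESSFULLY_PUSHED.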
324 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
325 kmp_info_t *thread = __kmp_threads[gtid];
326 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
327 kmp_task_team_t *task_team = thread->th.th_task_team;
328 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
329 kmp_thread_data_t *thread_data;
332 (
"__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
334 if (taskdata->td_flags.tiedness == TASK_UNTIED) {
337 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
338 KMP_DEBUG_USE_VAR(counter);
341 (
"__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
342 gtid, counter, taskdata));
346 if (taskdata->td_flags.task_serial) {
347 KA_TRACE(20, (
"__kmp_push_task: T#%d team serialized; returning "
348 "TASK_NOT_PUSHED for task %p\n",
350 return TASK_NOT_PUSHED;
355 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
356 if (!KMP_TASKING_ENABLED(task_team)) {
357 __kmp_enable_tasking(task_team, thread);
359 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
360 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
363 thread_data = &task_team->tt.tt_threads_data[tid];
366 if (thread_data->td.td_deque == NULL) {
367 __kmp_alloc_task_deque(thread, thread_data);
372 if (TCR_4(thread_data->td.td_deque_ntasks) >=
373 TASK_DEQUE_SIZE(thread_data->td)) {
374 if (__kmp_enable_task_throttling &&
375 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
376 thread->th.th_current_task)) {
377 KA_TRACE(20, (
"__kmp_push_task: T#%d deque is full; returning "
378 "TASK_NOT_PUSHED for task %p\n",
380 return TASK_NOT_PUSHED;
382 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
385 __kmp_realloc_task_deque(thread, thread_data);
390 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
392 if (TCR_4(thread_data->td.td_deque_ntasks) >=
393 TASK_DEQUE_SIZE(thread_data->td)) {
394 if (__kmp_enable_task_throttling &&
395 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
396 thread->th.th_current_task)) {
397 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
398 KA_TRACE(20, (
"__kmp_push_task: T#%d deque is full on 2nd check; "
399 "returning TASK_NOT_PUSHED for task %p\n",
401 return TASK_NOT_PUSHED;
404 __kmp_realloc_task_deque(thread, thread_data);
409 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
410 TASK_DEQUE_SIZE(thread_data->td));
412 thread_data->td.td_deque[thread_data->td.td_deque_tail] =
415 thread_data->td.td_deque_tail =
416 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
417 TCW_4(thread_data->td.td_deque_ntasks,
418 TCR_4(thread_data->td.td_deque_ntasks) + 1);
420 KA_TRACE(20, (
"__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
421 "task=%p ntasks=%d head=%u tail=%u\n",
422 gtid, taskdata, thread_data->td.td_deque_ntasks,
423 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
425 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
427 return TASK_SUCCESSFULLY_PUSHED;
434 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
435 KF_TRACE(10, (
"__kmp_pop_current_task_from_thread(enter): T#%d "
436 "this_thread=%p, curtask=%p, "
437 "curtask_parent=%p\n",
438 0, this_thr, this_thr->th.th_current_task,
439 this_thr->th.th_current_task->td_parent));
441 this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
443 KF_TRACE(10, (
"__kmp_pop_current_task_from_thread(exit): T#%d "
444 "this_thread=%p, curtask=%p, "
445 "curtask_parent=%p\n",
446 0, this_thr, this_thr->th.th_current_task,
447 this_thr->th.th_current_task->td_parent));
456 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
460 KF_TRACE(10, (
"__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
463 tid, this_thr, this_thr->th.th_current_task,
464 team->t.t_implicit_task_taskdata[tid].td_parent));
466 KMP_DEBUG_ASSERT(this_thr != NULL);
469 if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
470 team->t.t_implicit_task_taskdata[0].td_parent =
471 this_thr->th.th_current_task;
472 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
475 team->t.t_implicit_task_taskdata[tid].td_parent =
476 team->t.t_implicit_task_taskdata[0].td_parent;
477 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
480 KF_TRACE(10, (
"__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
483 tid, this_thr, this_thr->th.th_current_task,
484 team->t.t_implicit_task_taskdata[tid].td_parent));
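// __kmp_task_start: bookkeeping when a task begins execution: mark the
// encountering task as no longer executing, push tied tasks on the tied-task
// stack when BUILD_TIED_TASK_STACK is defined, and install the new task as
// th_current_task with its started/executing flags set.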
492 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
493 kmp_taskdata_t *current_task) {
494 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
495 kmp_info_t *thread = __kmp_threads[gtid];
498 (
"__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
499 gtid, taskdata, current_task));
501 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
506 current_task->td_flags.executing = 0;
509 #ifdef BUILD_TIED_TASK_STACK
510 if (taskdata->td_flags.tiedness == TASK_TIED) {
511 __kmp_push_task_stack(gtid, thread, taskdata);
516 thread->th.th_current_task = taskdata;
518 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
519 taskdata->td_flags.tiedness == TASK_UNTIED);
520 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
521 taskdata->td_flags.tiedness == TASK_UNTIED);
522 taskdata->td_flags.started = 1;
523 taskdata->td_flags.executing = 1;
524 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
525 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
532 KA_TRACE(10, (
"__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
543 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
545 task->ompt_task_info.task_data.value = 0;
546 task->ompt_task_info.frame.exit_frame = ompt_data_none;
547 task->ompt_task_info.frame.enter_frame = ompt_data_none;
548 task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
549 task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
550 task->ompt_task_info.ndeps = 0;
551 task->ompt_task_info.deps = NULL;
556 static inline void __ompt_task_start(kmp_task_t *task,
557 kmp_taskdata_t *current_task,
559 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
560 ompt_task_status_t status = ompt_task_switch;
561 if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
562 status = ompt_task_yield;
563 __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
566 if (ompt_enabled.ompt_callback_task_schedule) {
567 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
568 &(current_task->ompt_task_info.task_data), status,
569 &(taskdata->ompt_task_info.task_data));
571 taskdata->ompt_task_info.scheduling_parent = current_task;
577 __ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
578 ompt_task_status_t status = ompt_task_complete) {
579 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
580 if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
581 taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
582 status = ompt_task_cancel;
586 if (ompt_enabled.ompt_callback_task_schedule) {
587 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
588 &(taskdata->ompt_task_info.task_data), status,
589 &((resumed_task ? resumed_task
590 : (taskdata->ompt_task_info.scheduling_parent
591 ? taskdata->ompt_task_info.scheduling_parent
592 : taskdata->td_parent))
593 ->ompt_task_info.task_data));
599 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task, void *frame_address,
602 void *return_address) {
603 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
604 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
606 KA_TRACE(10, (
"__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
608 gtid, loc_ref, taskdata, current_task));
610 if (taskdata->td_flags.tiedness == TASK_UNTIED) {
613 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
614 KMP_DEBUG_USE_VAR(counter);
615 KA_TRACE(20, (
"__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
616 "incremented for task %p\n",
617 gtid, counter, taskdata));
620 taskdata->td_flags.task_serial =
622 __kmp_task_start(gtid, task, current_task);
626 if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
627 current_task->ompt_task_info.frame.enter_frame.ptr =
628 taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
629 current_task->ompt_task_info.frame.enter_frame_flags =
630 taskdata->ompt_task_info.frame.exit_frame_flags = ompt_frame_application | ompt_frame_framepointer;
632 if (ompt_enabled.ompt_callback_task_create) {
633 ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
634 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
635 &(parent_info->task_data), &(parent_info->frame),
636 &(taskdata->ompt_task_info.task_data),
637 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
640 __ompt_task_start(task, current_task, gtid);
642 #endif // OMPT_SUPPORT
644 KA_TRACE(10, (
"__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
650 static void __kmpc_omp_task_begin_if0_ompt(
ident_t *loc_ref, kmp_int32 gtid,
653 void *return_address) {
654 __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
657 #endif // OMPT_SUPPORT
665 void __kmpc_omp_task_begin_if0(
ident_t *loc_ref, kmp_int32 gtid,
668 if (UNLIKELY(ompt_enabled.enabled)) {
669 OMPT_STORE_RETURN_ADDRESS(gtid);
670 __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
671 OMPT_GET_FRAME_ADDRESS(1),
672 OMPT_LOAD_RETURN_ADDRESS(gtid));
676 __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
682 void __kmpc_omp_task_begin(
ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
683 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
687 (
"__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
688 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
690 __kmp_task_start(gtid, task, current_task);
692 KA_TRACE(10, (
"__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
693 loc_ref, KMP_TASK_TO_TASKDATA(task)));
696 #endif // TASK_UNUSED
703 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
704 kmp_info_t *thread) {
705 KA_TRACE(30, (
"__kmp_free_task: T#%d freeing data from task %p\n", gtid,
709 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
710 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
711 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
712 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
713 KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
714 taskdata->td_flags.task_serial == 1);
715 KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
717 taskdata->td_flags.freed = 1;
718 ANNOTATE_HAPPENS_BEFORE(taskdata);
721 __kmp_fast_free(thread, taskdata);
723 __kmp_thread_free(thread, taskdata);
726 KA_TRACE(20, (
"__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
735 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
736 kmp_taskdata_t *taskdata,
737 kmp_info_t *thread) {
740 kmp_int32 team_serial =
741 (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
742 !taskdata->td_flags.proxy;
743 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
745 kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
746 KMP_DEBUG_ASSERT(children >= 0);
749 while (children == 0) {
750 kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
752 KA_TRACE(20, (
"__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
753 "and freeing itself\n",
757 __kmp_free_task(gtid, taskdata, thread);
759 taskdata = parent_taskdata;
765 if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
766 if (taskdata->td_dephash) {
767 int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
768 kmp_tasking_flags_t flags_old = taskdata->td_flags;
769 if (children == 0 && flags_old.complete == 1) {
770 kmp_tasking_flags_t flags_new = flags_old;
771 flags_new.complete = 0;
772 if (KMP_COMPARE_AND_STORE_ACQ32(
773 RCAST(kmp_int32 *, &taskdata->td_flags),
774 *RCAST(kmp_int32 *, &flags_old),
775 *RCAST(kmp_int32 *, &flags_new))) {
776 KA_TRACE(100, (
"__kmp_free_task_and_ancestors: T#%d cleans "
777 "dephash of implicit task %p\n",
780 __kmp_dephash_free_entries(thread, taskdata->td_dephash);
787 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
788 KMP_DEBUG_ASSERT(children >= 0);
792 20, (
"__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
793 "not freeing it yet\n",
794 gtid, taskdata, children));
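// __kmp_task_finish: bookkeeping when a task finishes (or an untied task
// suspends): adjust untied counts, release mutexinoutset locks, turn a
// detachable task with an unfulfilled allow-completion event into a proxy
// task, mark the task complete, update the parent's incomplete-children and
// taskgroup counts, release dependences, run destructor thunks, and resume
// resumed_task (defaulting to the parent for serialized tasks).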
803 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
804 kmp_taskdata_t *resumed_task) {
805 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
806 kmp_info_t *thread = __kmp_threads[gtid];
807 kmp_task_team_t *task_team =
808 thread->th.th_task_team;
809 kmp_int32 children = 0;
811 KA_TRACE(10, (
"__kmp_task_finish(enter): T#%d finishing task %p and resuming "
813 gtid, taskdata, resumed_task));
815 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
818 #ifdef BUILD_TIED_TASK_STACK
819 if (taskdata->td_flags.tiedness == TASK_TIED) {
820 __kmp_pop_task_stack(gtid, thread, taskdata);
824 if (taskdata->td_flags.tiedness == TASK_UNTIED) {
827 kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
830 (
"__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
831 gtid, counter, taskdata));
835 if (resumed_task == NULL) {
836 KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
837 resumed_task = taskdata->td_parent;
840 thread->th.th_current_task = resumed_task;
841 resumed_task->td_flags.executing = 1;
842 KA_TRACE(10, (
"__kmp_task_finish(exit): T#%d partially done task %p, "
843 "resuming task %p\n",
844 gtid, taskdata, resumed_task));
850 __ompt_task_finish(task, resumed_task);
854 kmp_depnode_t *node = taskdata->td_depnode;
855 if (node && (node->dn.mtx_num_locks < 0)) {
857 node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
858 for (
int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
859 KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
860 __kmp_release_lock(node->dn.mtx_locks[i], gtid);
864 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
866 if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
867 if (taskdata->td_allow_completion_event.type ==
868 KMP_EVENT_ALLOW_COMPLETION) {
870 __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
871 if (taskdata->td_allow_completion_event.type ==
872 KMP_EVENT_ALLOW_COMPLETION) {
873 taskdata->td_flags.proxy = TASK_PROXY;
876 __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
879 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
880 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
883 taskdata->td_flags.complete = 1;
887 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
890 children = KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
891 KMP_DEBUG_ASSERT(children >= 0);
892 if (taskdata->td_taskgroup)
893 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
894 __kmp_release_deps(gtid, taskdata);
895 } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
898 __kmp_release_deps(gtid, taskdata);
905 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
906 taskdata->td_flags.executing = 0;
909 20, (
"__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
910 gtid, taskdata, children));
918 if (taskdata->td_flags.destructors_thunk) {
919 kmp_routine_entry_t destr_thunk = task->data1.destructors;
920 KMP_ASSERT(destr_thunk);
921 destr_thunk(gtid, task);
927 KMP_DEBUG_ASSERT((taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
928 taskdata->td_flags.task_serial);
929 if (taskdata->td_flags.task_serial) {
930 if (resumed_task == NULL) {
931 resumed_task = taskdata->td_parent;
935 KMP_DEBUG_ASSERT(resumed_task !=
943 thread->th.th_current_task = resumed_task;
945 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
949 resumed_task->td_flags.executing = 1;
952 10, (
"__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
953 gtid, taskdata, resumed_task));
959 static void __kmpc_omp_task_complete_if0_template(
ident_t *loc_ref,
962 KA_TRACE(10, (
"__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
963 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
965 __kmp_task_finish<ompt>(gtid, task, NULL);
967 KA_TRACE(10, (
"__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
968 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
972 ompt_frame_t *ompt_frame;
973 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
974 ompt_frame->enter_frame = ompt_data_none;
975 ompt_frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
984 void __kmpc_omp_task_complete_if0_ompt(
ident_t *loc_ref, kmp_int32 gtid,
986 __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
988 #endif // OMPT_SUPPORT
995 void __kmpc_omp_task_complete_if0(
ident_t *loc_ref, kmp_int32 gtid,
998 if (UNLIKELY(ompt_enabled.enabled)) {
999 __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1003 __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1009 void __kmpc_omp_task_complete(
ident_t *loc_ref, kmp_int32 gtid,
1011 KA_TRACE(10, (
"__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1012 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1014 __kmp_task_finish<false>(gtid, task,
1017 KA_TRACE(10, (
"__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1018 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1021 #endif // TASK_UNUSED
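// __kmp_init_implicit_task: (re)initialize the implicit task descriptor for a
// given thread of a team; when set_curr_task is true the child counters,
// taskgroup, and dephash fields are reset and the task is installed as the
// thread's current task.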
1034 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1035 kmp_team_t *team, int tid, int set_curr_task) {
1036 kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1040 (
"__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1041 tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1043 task->td_task_id = KMP_GEN_TASK_ID();
1044 task->td_team = team;
1047 task->td_ident = loc_ref;
1048 task->td_taskwait_ident = NULL;
1049 task->td_taskwait_counter = 0;
1050 task->td_taskwait_thread = 0;
1052 task->td_flags.tiedness = TASK_TIED;
1053 task->td_flags.tasktype = TASK_IMPLICIT;
1054 task->td_flags.proxy = TASK_FULL;
1057 task->td_flags.task_serial = 1;
1058 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1059 task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1061 task->td_flags.started = 1;
1062 task->td_flags.executing = 1;
1063 task->td_flags.complete = 0;
1064 task->td_flags.freed = 0;
1066 task->td_depnode = NULL;
1067 task->td_last_tied = task;
1068 task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1070 if (set_curr_task) {
1071 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1073 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1074 task->td_taskgroup = NULL;
1075 task->td_dephash = NULL;
1076 __kmp_push_current_task_to_thread(this_thr, team, tid);
1078 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1079 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1083 if (UNLIKELY(ompt_enabled.enabled))
1084 __ompt_task_init(task, tid);
1087 KF_TRACE(10, (
"__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1096 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1097 kmp_taskdata_t *task = thread->th.th_current_task;
1098 if (task->td_dephash) {
1100 task->td_flags.complete = 1;
1101 children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1102 kmp_tasking_flags_t flags_old = task->td_flags;
1103 if (children == 0 && flags_old.complete == 1) {
1104 kmp_tasking_flags_t flags_new = flags_old;
1105 flags_new.complete = 0;
1106 if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1107 *RCAST(kmp_int32 *, &flags_old),
1108 *RCAST(kmp_int32 *, &flags_new))) {
1109 KA_TRACE(100, (
"__kmp_finish_implicit_task: T#%d cleans "
1110 "dephash of implicit task %p\n",
1111 thread->th.th_info.ds.ds_gtid, task));
1112 __kmp_dephash_free_entries(thread, task->td_dephash);
1122 void __kmp_free_implicit_task(kmp_info_t *thread) {
1123 kmp_taskdata_t *task = thread->th.th_current_task;
1124 if (task && task->td_dephash) {
1125 __kmp_dephash_free(thread, task->td_dephash);
1126 task->td_dephash = NULL;
1132 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1133 if (size & (val - 1)) {
1135 if (size <= KMP_SIZE_T_MAX - val) {
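// __kmp_task_alloc: allocate and initialize a kmp_taskdata_t together with
// the user-visible kmp_task_t and its shareds area in a single block; proxy
// and detachable tasks force the task team and per-thread deques to exist
// before the task can be deferred.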
1154 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1155 kmp_tasking_flags_t *flags,
1156 size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1157 kmp_routine_entry_t task_entry) {
1159 kmp_taskdata_t *taskdata;
1160 kmp_info_t *thread = __kmp_threads[gtid];
1161 kmp_team_t *team = thread->th.th_team;
1162 kmp_taskdata_t *parent_task = thread->th.th_current_task;
1163 size_t shareds_offset;
1165 if (!TCR_4(__kmp_init_middle))
1166 __kmp_middle_initialize();
1168 KA_TRACE(10, (
"__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1169 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1170 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1171 sizeof_shareds, task_entry));
1173 if (parent_task->td_flags.final) {
1174 if (flags->merged_if0) {
1178 if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1182 KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1188 if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
1189 if (flags->proxy == TASK_PROXY) {
1190 flags->tiedness = TASK_UNTIED;
1191 flags->merged_if0 = 1;
1195 if ((thread->th.th_task_team) == NULL) {
1198 KMP_DEBUG_ASSERT(team->t.t_serialized);
1200 (
"T#%d creating task team in __kmp_task_alloc for proxy task\n",
1202 __kmp_task_team_setup(
1205 thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1207 kmp_task_team_t *task_team = thread->th.th_task_team;
1210 if (!KMP_TASKING_ENABLED(task_team)) {
1213 (
"T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1214 __kmp_enable_tasking(task_team, thread);
1215 kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1216 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1218 if (thread_data->td.td_deque == NULL) {
1219 __kmp_alloc_task_deque(thread, thread_data);
1223 if (task_team->tt.tt_found_proxy_tasks == FALSE)
1224 TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1229 shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1230 shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1233 KA_TRACE(30, (
"__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1235 KA_TRACE(30, (
"__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1240 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1243 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1246 ANNOTATE_HAPPENS_AFTER(taskdata);
1248 task = KMP_TASKDATA_TO_TASK(taskdata);
1251 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1252 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1253 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1255 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1256 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1258 if (sizeof_shareds > 0) {
1260 task->shareds = &((char *)taskdata)[shareds_offset];
1262 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1265 task->shareds = NULL;
1267 task->routine = task_entry;
1270 taskdata->td_task_id = KMP_GEN_TASK_ID();
1271 taskdata->td_team = team;
1272 taskdata->td_alloc_thread = thread;
1273 taskdata->td_parent = parent_task;
1274 taskdata->td_level = parent_task->td_level + 1;
1275 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1276 taskdata->td_ident = loc_ref;
1277 taskdata->td_taskwait_ident = NULL;
1278 taskdata->td_taskwait_counter = 0;
1279 taskdata->td_taskwait_thread = 0;
1280 KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1282 if (flags->proxy == TASK_FULL)
1283 copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1285 taskdata->td_flags.tiedness = flags->tiedness;
1286 taskdata->td_flags.final = flags->final;
1287 taskdata->td_flags.merged_if0 = flags->merged_if0;
1288 taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1289 taskdata->td_flags.proxy = flags->proxy;
1290 taskdata->td_flags.detachable = flags->detachable;
1291 taskdata->td_task_team = thread->th.th_task_team;
1292 taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1293 taskdata->td_flags.tasktype = TASK_EXPLICIT;
1296 taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1299 taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1305 taskdata->td_flags.task_serial =
1306 (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1307 taskdata->td_flags.tasking_ser);
1309 taskdata->td_flags.started = 0;
1310 taskdata->td_flags.executing = 0;
1311 taskdata->td_flags.complete = 0;
1312 taskdata->td_flags.freed = 0;
1314 taskdata->td_flags.native = flags->native;
1316 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1318 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1319 taskdata->td_taskgroup =
1320 parent_task->td_taskgroup;
1321 taskdata->td_dephash = NULL;
1322 taskdata->td_depnode = NULL;
1323 if (flags->tiedness == TASK_UNTIED)
1324 taskdata->td_last_tied = NULL;
1326 taskdata->td_last_tied = taskdata;
1327 taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1329 if (UNLIKELY(ompt_enabled.enabled))
1330 __ompt_task_init(taskdata, gtid);
1334 if (flags->proxy == TASK_PROXY ||
1335 flags->detachable == TASK_DETACHABLE ||
1336 !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1338 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1339 if (parent_task->td_taskgroup)
1340 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1343 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1344 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1348 KA_TRACE(20, (
"__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1349 gtid, taskdata, taskdata->td_parent));
1350 ANNOTATE_HAPPENS_BEFORE(task);
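// __kmpc_omp_task_alloc: compiler-visible wrapper around __kmp_task_alloc.
// A front end lowering "#pragma omp task" typically pairs this call with a
// later __kmpc_omp_task(); a rough sketch (the outlined entry point and the
// shareds size below are illustrative, not taken from this file):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, /*flags=*/1 /*tied*/,
//                                         sizeof(kmp_task_t), sizeof_shareds,
//                                         &outlined_task_entry);
//   // ... fill t->shareds ...
//   __kmpc_omp_task(loc, gtid, t);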
1355 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1356 kmp_int32 flags, size_t sizeof_kmp_task_t,
1357 size_t sizeof_shareds,
1358 kmp_routine_entry_t task_entry) {
1360 kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1362 input_flags->native = FALSE;
1365 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1366 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1367 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1368 input_flags->proxy ? "proxy" : "",
1369 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1370 sizeof_shareds, task_entry));
1372 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1373 sizeof_shareds, task_entry);
1375 KA_TRACE(20, (
"__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1380 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
kmp_int32 flags, size_t sizeof_kmp_task_t,
1383 size_t sizeof_shareds,
1384 kmp_routine_entry_t task_entry,
1385 kmp_int64 device_id) {
1386 return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1387 sizeof_shareds, task_entry);
1405 kmp_task_t *new_task, kmp_int32 naffins,
1406 kmp_task_affinity_info_t *affin_list) {
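// __kmp_invoke_task: execute a task: run the bottom half for an already
// completed proxy task, otherwise start the task, honor pending taskgroup or
// parallel cancellation, call the task routine (GOMP-style for native tasks),
// and finish it, with OMPT/ITT instrumentation around the invocation.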
1415 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1416 kmp_taskdata_t *current_task) {
1417 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1421 30, (
"__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1422 gtid, taskdata, current_task));
1423 KMP_DEBUG_ASSERT(task);
1424 if (taskdata->td_flags.proxy == TASK_PROXY &&
1425 taskdata->td_flags.complete == 1) {
1430 (
"__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1433 __kmp_bottom_half_finish_proxy(gtid, task);
1435 KA_TRACE(30, (
"__kmp_invoke_task(exit): T#%d completed bottom finish for "
1436 "proxy task %p, resuming task %p\n",
1437 gtid, taskdata, current_task));
1445 ompt_thread_info_t oldInfo;
1446 if (UNLIKELY(ompt_enabled.enabled)) {
1448 thread = __kmp_threads[gtid];
1449 oldInfo = thread->th.ompt_thread_info;
1450 thread->th.ompt_thread_info.wait_id = 0;
1451 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1452 ? ompt_state_work_serial
1453 : ompt_state_work_parallel;
1454 taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1459 if (taskdata->td_flags.proxy != TASK_PROXY) {
1460 ANNOTATE_HAPPENS_AFTER(task);
1461 __kmp_task_start(gtid, task, current_task);
1467 if (__kmp_omp_cancellation) {
1468 thread = __kmp_threads[gtid];
1469 kmp_team_t *this_team = thread->th.th_team;
1470 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1471 if ((taskgroup && taskgroup->cancel_request) ||
1472 (this_team->t.t_cancel_request == cancel_parallel)) {
1473 #if OMPT_SUPPORT && OMPT_OPTIONAL
1474 ompt_data_t *task_data;
1475 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1476 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1477 ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1479 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1480 : ompt_cancel_parallel) |
1481 ompt_cancel_discarded_task,
1494 if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1495 taskdata->td_last_tied = current_task->td_last_tied;
1496 KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1498 #if KMP_STATS_ENABLED
1500 switch (KMP_GET_THREAD_STATE()) {
1501 case FORK_JOIN_BARRIER:
1502 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1505 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1508 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1511 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1514 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1517 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1520 #endif // KMP_STATS_ENABLED
1524 if (UNLIKELY(ompt_enabled.enabled))
1525 __ompt_task_start(task, current_task, gtid);
1528 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1529 kmp_uint64 cur_time;
1530 kmp_int32 kmp_itt_count_task =
1531 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1532 current_task->td_flags.tasktype == TASK_IMPLICIT;
1533 if (kmp_itt_count_task) {
1534 thread = __kmp_threads[gtid];
1536 if (thread->th.th_bar_arrive_time)
1537 cur_time = __itt_get_timestamp();
1539 kmp_itt_count_task = 0;
1543 #ifdef KMP_GOMP_COMPAT
1544 if (taskdata->td_flags.native) {
1545 ((void (*)(void *))(*(task->routine)))(task->shareds);
1549 (*(task->routine))(gtid, task);
1551 KMP_POP_PARTITIONED_TIMER();
1553 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1554 if (kmp_itt_count_task) {
1556 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1564 if (taskdata->td_flags.proxy != TASK_PROXY) {
1565 ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1567 if (UNLIKELY(ompt_enabled.enabled)) {
1568 thread->th.ompt_thread_info = oldInfo;
1569 if (taskdata->td_flags.tiedness == TASK_TIED) {
1570 taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1572 __kmp_task_finish<true>(gtid, task, current_task);
1575 __kmp_task_finish<false>(gtid, task, current_task);
1580 (
"__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1581 gtid, taskdata, current_task));
1595 kmp_int32 __kmpc_omp_task_parts(
ident_t *loc_ref, kmp_int32 gtid,
1596 kmp_task_t *new_task) {
1597 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1599 KA_TRACE(10, (
"__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1600 loc_ref, new_taskdata));
1603 kmp_taskdata_t *parent;
1604 if (UNLIKELY(ompt_enabled.enabled)) {
1605 parent = new_taskdata->td_parent;
1606 if (ompt_enabled.ompt_callback_task_create) {
1607 ompt_data_t task_data = ompt_data_none;
1608 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1609 parent ? &(parent->ompt_task_info.task_data) : &task_data,
1610 parent ? &(parent->ompt_task_info.frame) : NULL,
1611 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1612 OMPT_GET_RETURN_ADDRESS(0));
1620 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED)
1622 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1623 new_taskdata->td_flags.task_serial = 1;
1624 __kmp_invoke_task(gtid, new_task, current_task);
1629 (
"__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1630 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1631 gtid, loc_ref, new_taskdata));
1633 ANNOTATE_HAPPENS_BEFORE(new_task);
1635 if (UNLIKELY(ompt_enabled.enabled)) {
1636 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1639 return TASK_CURRENT_NOT_QUEUED;
1653 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1654 bool serialize_immediate) {
1655 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1659 if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1660 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED)
1662 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1663 if (serialize_immediate)
1664 new_taskdata->td_flags.task_serial = 1;
1665 __kmp_invoke_task(gtid, new_task, current_task);
1668 ANNOTATE_HAPPENS_BEFORE(new_task);
1669 return TASK_CURRENT_NOT_QUEUED;
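// __kmpc_omp_task: compiler entry point for a deferrable explicit task;
// attempts to enqueue new_task and, when it cannot be deferred, serializes
// and invokes it immediately, always returning TASK_CURRENT_NOT_QUEUED.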
1684 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1685 kmp_task_t *new_task) {
1687 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1689 #if KMP_DEBUG || OMPT_SUPPORT
1690 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1692 KA_TRACE(10, (
"__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1696 kmp_taskdata_t *parent = NULL;
1697 if (UNLIKELY(ompt_enabled.enabled)) {
1698 if (!new_taskdata->td_flags.started) {
1699 OMPT_STORE_RETURN_ADDRESS(gtid);
1700 parent = new_taskdata->td_parent;
1701 if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1702 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1704 if (ompt_enabled.ompt_callback_task_create) {
1705 ompt_data_t task_data = ompt_data_none;
1706 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1707 parent ? &(parent->ompt_task_info.task_data) : &task_data,
1708 parent ? &(parent->ompt_task_info.frame) : NULL,
1709 &(new_taskdata->ompt_task_info.task_data),
1710 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1711 OMPT_LOAD_RETURN_ADDRESS(gtid));
1716 __ompt_task_finish(new_task,
1717 new_taskdata->ompt_task_info.scheduling_parent,
1719 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1724 res = __kmp_omp_task(gtid, new_task, true);
1726 KA_TRACE(10, (
"__kmpc_omp_task(exit): T#%d returning "
1727 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1728 gtid, loc_ref, new_taskdata));
1730 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1731 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1750 kmp_int32 __kmp_omp_taskloop_task(
ident_t *loc_ref, kmp_int32 gtid,
1751 kmp_task_t *new_task,
void *codeptr_ra) {
1753 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1755 #if KMP_DEBUG || OMPT_SUPPORT
1756 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1758 KA_TRACE(10, (
"__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1762 kmp_taskdata_t *parent = NULL;
1763 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1764 parent = new_taskdata->td_parent;
1765 if (!parent->ompt_task_info.frame.enter_frame.ptr)
1766 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1767 if (ompt_enabled.ompt_callback_task_create) {
1768 ompt_data_t task_data = ompt_data_none;
1769 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1770 parent ? &(parent->ompt_task_info.task_data) : &task_data,
1771 parent ? &(parent->ompt_task_info.frame) : NULL,
1772 &(new_taskdata->ompt_task_info.task_data),
1773 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1779 res = __kmp_omp_task(gtid, new_task, true);
1781 KA_TRACE(10, (
"__kmpc_omp_task(exit): T#%d returning "
1782 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1783 gtid, loc_ref, new_taskdata));
1785 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1786 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
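// __kmpc_omp_taskwait_template: implementation of "#pragma omp taskwait";
// the calling thread executes deferred tasks until its incomplete-children
// count drops to zero, emitting OMPT sync-region callbacks when enabled.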
1792 template <bool ompt>
1793 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1794 void *frame_address,
1795 void *return_address) {
1796 kmp_taskdata_t *taskdata;
1798 int thread_finished = FALSE;
1799 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1801 KA_TRACE(10, (
"__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1803 if (__kmp_tasking_mode != tskm_immediate_exec) {
1804 thread = __kmp_threads[gtid];
1805 taskdata = thread->th.th_current_task;
1807 #if OMPT_SUPPORT && OMPT_OPTIONAL
1808 ompt_data_t *my_task_data;
1809 ompt_data_t *my_parallel_data;
1812 my_task_data = &(taskdata->ompt_task_info.task_data);
1813 my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1815 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1817 if (ompt_enabled.ompt_callback_sync_region) {
1818 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1819 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1820 my_task_data, return_address);
1823 if (ompt_enabled.ompt_callback_sync_region_wait) {
1824 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1825 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1826 my_task_data, return_address);
1829 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1836 taskdata->td_taskwait_counter += 1;
1837 taskdata->td_taskwait_ident = loc_ref;
1838 taskdata->td_taskwait_thread = gtid + 1;
1841 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1842 if (itt_sync_obj != NULL)
1843 __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1847 bool must_wait = !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1849 must_wait = must_wait || (thread->th.th_task_team != NULL &&
1850 thread->th.th_task_team->tt.tt_found_proxy_tasks);
1852 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
1853 &(taskdata->td_incomplete_child_tasks)),
1855 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1856 flag.execute_tasks(thread, gtid, FALSE,
1857 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1858 __kmp_task_stealing_constraint);
1862 if (itt_sync_obj != NULL)
1863 __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1868 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1870 #if OMPT_SUPPORT && OMPT_OPTIONAL
1872 if (ompt_enabled.ompt_callback_sync_region_wait) {
1873 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1874 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1875 my_task_data, return_address);
1877 if (ompt_enabled.ompt_callback_sync_region) {
1878 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1879 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1880 my_task_data, return_address);
1882 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1884 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1886 ANNOTATE_HAPPENS_AFTER(taskdata);
1889 KA_TRACE(10, (
"__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1890 "returning TASK_CURRENT_NOT_QUEUED\n",
1893 return TASK_CURRENT_NOT_QUEUED;
1896 #if OMPT_SUPPORT && OMPT_OPTIONAL
1898 static kmp_int32 __kmpc_omp_taskwait_ompt(
ident_t *loc_ref, kmp_int32 gtid,
1899 void *frame_address,
1900 void *return_address) {
1901 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1904 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1908 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1909 #if OMPT_SUPPORT && OMPT_OPTIONAL
1910 if (UNLIKELY(ompt_enabled.enabled)) {
1911 OMPT_STORE_RETURN_ADDRESS(gtid);
1912 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
1913 OMPT_LOAD_RETURN_ADDRESS(gtid));
1916 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
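// __kmpc_omp_taskyield: implementation of "#pragma omp taskyield"; when
// tasking is active the thread may execute other available tasks before
// resuming the current one.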
1920 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1921 kmp_taskdata_t *taskdata;
1923 int thread_finished = FALSE;
1926 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1928 KA_TRACE(10, (
"__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1929 gtid, loc_ref, end_part));
1931 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1932 thread = __kmp_threads[gtid];
1933 taskdata = thread->th.th_current_task;
1940 taskdata->td_taskwait_counter += 1;
1941 taskdata->td_taskwait_ident = loc_ref;
1942 taskdata->td_taskwait_thread = gtid + 1;
1945 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1946 if (itt_sync_obj != NULL)
1947 __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1949 if (!taskdata->td_flags.team_serial) {
1950 kmp_task_team_t *task_team = thread->th.th_task_team;
1951 if (task_team != NULL) {
1952 if (KMP_TASKING_ENABLED(task_team)) {
1954 if (UNLIKELY(ompt_enabled.enabled))
1955 thread->th.ompt_thread_info.ompt_task_yielded = 1;
1957 __kmp_execute_tasks_32(
1958 thread, gtid, NULL, FALSE,
1959 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1960 __kmp_task_stealing_constraint);
1962 if (UNLIKELY(ompt_enabled.enabled))
1963 thread->th.ompt_thread_info.ompt_task_yielded = 0;
1969 if (itt_sync_obj != NULL)
1970 __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1975 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1978 KA_TRACE(10, (
"__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1979 "returning TASK_CURRENT_NOT_QUEUED\n",
1982 return TASK_CURRENT_NOT_QUEUED;
2003 unsigned reserved31 : 31;
2083 template <typename T>
2084 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2085 kmp_info_t *thread = __kmp_threads[gtid];
2086 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2087 kmp_int32 nth = thread->th.th_team_nproc;
2091 KMP_ASSERT(tg != NULL);
2092 KMP_ASSERT(data != NULL);
2093 KMP_ASSERT(num > 0);
2095 KA_TRACE(10, (
"__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2099 KA_TRACE(10, (
"__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2103 for (
int i = 0; i < num; ++i) {
2104 size_t size = data[i].reduce_size - 1;
2106 size += CACHE_LINE - size % CACHE_LINE;
2107 KMP_ASSERT(data[i].reduce_comb != NULL);
2110 arr[i].flags = data[i].flags;
2114 __kmp_assign_orig<T>(arr[i], data[i]);
2115 if (!arr[i].flags.lazy_priv) {
2118 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2119 if (arr[i].reduce_init != NULL) {
2121 for (int j = 0; j < nth; ++j) {
2122 __kmp_call_init<T>(arr[i], j * size);
2129 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2132 tg->reduce_data = (void *)arr;
2133 tg->reduce_num_data = num;
2172 template <
typename T>
2173 void __kmp_task_reduction_init_copy(kmp_info_t *thr,
int num, T *data,
2174 kmp_taskgroup_t *tg,
void *reduce_data) {
2176 KA_TRACE(20, (
"__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2178 thr, tg, reduce_data));
2183 for (
int i = 0; i < num; ++i) {
2186 tg->reduce_data = (
void *)arr;
2187 tg->reduce_num_data = num;
2200 kmp_info_t *thread = __kmp_threads[gtid];
2201 kmp_int32 nth = thread->th.th_team_nproc;
2205 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2207 tg = thread->th.th_current_task->td_taskgroup;
2208 KMP_ASSERT(tg != NULL);
2210 kmp_int32 num = tg->reduce_num_data;
2211 kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2213 KMP_ASSERT(data != NULL);
2214 while (tg != NULL) {
2215 for (
int i = 0; i < num; ++i) {
2216 if (!arr[i].flags.lazy_priv) {
2217 if (data == arr[i].reduce_shar ||
2218 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2219 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2222 void **p_priv = (void **)(arr[i].reduce_priv);
2223 if (data == arr[i].reduce_shar)
2226 for (
int j = 0; j < nth; ++j)
2227 if (data == p_priv[j])
2231 if (p_priv[tid] == NULL) {
2233 p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2234 if (arr[i].reduce_init != NULL) {
2235 if (arr[i].reduce_orig != NULL) {
2237 p_priv[tid], arr[i].reduce_orig);
2239 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2248 num = tg->reduce_num_data;
2250 KMP_ASSERT2(0,
"Unknown task reduction item");
2256 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2257 kmp_int32 nth = th->th.th_team_nproc;
2258 KMP_DEBUG_ASSERT(nth > 1);
2260 kmp_int32 num = tg->reduce_num_data;
2261 for (
int i = 0; i < num; ++i) {
2263 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2264 void (*f_comb)(void *, void *) =
2266 if (!arr[i].flags.lazy_priv) {
2269 for (int j = 0; j < nth; ++j) {
2270 void *priv_data = (char *)pr_data + j * size;
2271 f_comb(sh_data, priv_data);
2276 void **pr_data = (void **)(arr[i].reduce_priv);
2277 for (int j = 0; j < nth; ++j) {
2278 if (pr_data[j] != NULL) {
2279 f_comb(sh_data, pr_data[j]);
2282 __kmp_free(pr_data[j]);
2286 __kmp_free(arr[i].reduce_priv);
2288 __kmp_thread_free(th, arr);
2289 tg->reduce_data = NULL;
2290 tg->reduce_num_data = 0;
2296 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2297 __kmp_thread_free(th, tg->reduce_data);
2298 tg->reduce_data = NULL;
2299 tg->reduce_num_data = 0;
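// __kmp_task_reduction_modifier_init: set up task reduction for a
// reduction(task, ...) modifier on a parallel or worksharing construct; the
// first thread to arrive initializes the team's shared reduce_data, and the
// remaining threads copy it into their implicit task's taskgroup.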
2302 template <typename T>
2303 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
int num, T *data) {
2305 kmp_info_t *thr = __kmp_threads[gtid];
2306 kmp_int32 nth = thr->th.th_team_nproc;
2307 __kmpc_taskgroup(loc, gtid);
2310 (
"__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2311 gtid, thr->th.th_current_task->td_taskgroup));
2312 return (
void *)thr->th.th_current_task->td_taskgroup;
2314 kmp_team_t *team = thr->th.th_team;
2316 kmp_taskgroup_t *tg;
2317 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2318 if (reduce_data == NULL &&
2319 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2322 KMP_DEBUG_ASSERT(reduce_data == NULL);
2324 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2328 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2329 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2330 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2333 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2337 KMP_DEBUG_ASSERT(reduce_data > (
void *)1);
2338 tg = thr->th.th_current_task->td_taskgroup;
2339 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2361 int num,
void *data) {
2362 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2382 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2395 __kmpc_end_taskgroup(loc, gtid);
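// __kmpc_taskgroup: enter a taskgroup region: allocate a kmp_taskgroup_t,
// link it to the enclosing taskgroup of the current task, and report the
// scope begin to OMPT when enabled.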
2399 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2400 kmp_info_t *thread = __kmp_threads[gtid];
2401 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2402 kmp_taskgroup_t *tg_new =
2403 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2404 KA_TRACE(10, (
"__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2405 KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2406 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2407 tg_new->parent = taskdata->td_taskgroup;
2408 tg_new->reduce_data = NULL;
2409 tg_new->reduce_num_data = 0;
2410 taskdata->td_taskgroup = tg_new;
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2414 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2416 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2417 kmp_team_t *team = thread->th.th_team;
2418 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2420 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2422 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2423 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2424 &(my_task_data), codeptr);
2431 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2432 kmp_info_t *thread = __kmp_threads[gtid];
2433 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2434 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2435 int thread_finished = FALSE;
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2439 ompt_data_t my_task_data;
2440 ompt_data_t my_parallel_data;
2442 if (UNLIKELY(ompt_enabled.enabled)) {
2443 team = thread->th.th_team;
2444 my_task_data = taskdata->ompt_task_info.task_data;
2446 my_parallel_data = team->t.ompt_team_info.parallel_data;
2447 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2449 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2453 KA_TRACE(10, (
"__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2454 KMP_DEBUG_ASSERT(taskgroup != NULL);
2455 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2457 if (__kmp_tasking_mode != tskm_immediate_exec) {
2459 taskdata->td_taskwait_counter += 1;
2460 taskdata->td_taskwait_ident = loc;
2461 taskdata->td_taskwait_thread = gtid + 1;
2465 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2466 if (itt_sync_obj != NULL)
2467 __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2470 #if OMPT_SUPPORT && OMPT_OPTIONAL
2471 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2472 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2473 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2474 &(my_task_data), codeptr);
2478 if (!taskdata->td_flags.team_serial ||
2479 (thread->th.th_task_team != NULL &&
2480 thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
2481 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2483 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2484 flag.execute_tasks(thread, gtid, FALSE,
2485 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2486 __kmp_task_stealing_constraint);
2489 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2491 #if OMPT_SUPPORT && OMPT_OPTIONAL
2492 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2493 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2494 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2495 &(my_task_data), codeptr);
2500 if (itt_sync_obj != NULL)
2501 __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2504 KMP_DEBUG_ASSERT(taskgroup->count == 0);
2506 if (taskgroup->reduce_data != NULL) {
2509 kmp_team_t *t = thread->th.th_team;
2513 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2516 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2517 if (cnt == thread->th.th_team_nproc - 1) {
2520 __kmp_task_reduction_fini(thread, taskgroup);
2523 __kmp_thread_free(thread, reduce_data);
2524 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2525 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2529 __kmp_task_reduction_clean(thread, taskgroup);
2531 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2535 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2536 if (cnt == thread->th.th_team_nproc - 1) {
2538 __kmp_task_reduction_fini(thread, taskgroup);
2541 __kmp_thread_free(thread, reduce_data);
2542 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2543 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2547 __kmp_task_reduction_clean(thread, taskgroup);
2551 __kmp_task_reduction_fini(thread, taskgroup);
2555 taskdata->td_taskgroup = taskgroup->parent;
2556 __kmp_thread_free(thread, taskgroup);
2558 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2560 ANNOTATE_HAPPENS_AFTER(taskdata);
2562 #if OMPT_SUPPORT && OMPT_OPTIONAL
2563 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2564 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2565 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2566 &(my_task_data), codeptr);
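// Illustrative note (not part of the runtime source): for a user-level
//   #pragma omp taskgroup
//   { /* task-generating code */ }
// the compiler is expected to bracket the region with the two entry points
// defined above, roughly:
//   __kmpc_taskgroup(loc, gtid);      // push a fresh kmp_taskgroup_t
//   ... user code that creates tasks ...
//   __kmpc_end_taskgroup(loc, gtid);  // wait until taskgroup->count drops to 0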
2572 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2573 kmp_task_team_t *task_team,
2574 kmp_int32 is_constrained) {
2576 kmp_taskdata_t *taskdata;
2577 kmp_thread_data_t *thread_data;
2580 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2581 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2584 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2586 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2587 gtid, thread_data->td.td_deque_ntasks,
2588 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2590 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2592 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2593 "ntasks=%d head=%u tail=%u\n",
2594 gtid, thread_data->td.td_deque_ntasks,
2595 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2599 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2601 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2602 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2604 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2605 "ntasks=%d head=%u tail=%u\n",
2606 gtid, thread_data->td.td_deque_ntasks,
2607 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2611 tail = (thread_data->td.td_deque_tail - 1) &
2612 TASK_DEQUE_MASK(thread_data->td);
2613 taskdata = thread_data->td.td_deque[tail];
2615 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2616 thread->th.th_current_task)) {
2618 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2620 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2621 "ntasks=%d head=%u tail=%u\n",
2622 gtid, thread_data->td.td_deque_ntasks,
2623 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2627 thread_data->td.td_deque_tail = tail;
2628 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2630 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2632 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2633 "ntasks=%d head=%u tail=%u\n",
2634 gtid, taskdata, thread_data->td.td_deque_ntasks,
2635 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2637 task = KMP_TASKDATA_TO_TASK(taskdata);
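// Sketch of the deque arithmetic used above (assuming td_deque_size stays a
// power of two, so TASK_DEQUE_MASK(td) == td_deque_size - 1):
//   tail = (tail - 1) & TASK_DEQUE_MASK(td);   // owner pops newest task (LIFO)
//   head = (head + 1) & TASK_DEQUE_MASK(td);   // thieves take oldest (FIFO)
// so the owner and stealing threads normally work on opposite ends of the
// same deque.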
2644 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2645 kmp_task_team_t *task_team,
2646 std::atomic<kmp_int32> *unfinished_threads,
2647 int *thread_finished,
2648 kmp_int32 is_constrained) {
2650 kmp_taskdata_t *taskdata;
2651 kmp_taskdata_t *current;
2652 kmp_thread_data_t *victim_td, *threads_data;
2654 kmp_int32 victim_tid;
2656 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2658 threads_data = task_team->tt.tt_threads_data;
2659 KMP_DEBUG_ASSERT(threads_data != NULL);
2661 victim_tid = victim_thr->th.th_info.ds.ds_tid;
2662 victim_td = &threads_data[victim_tid];
2664 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2665 "task_team=%p ntasks=%d head=%u tail=%u\n",
2666 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2667 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2668 victim_td->td.td_deque_tail));
2670 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2671 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2672 "task_team=%p ntasks=%d head=%u tail=%u\n",
2673 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2674 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2675 victim_td->td.td_deque_tail));
2679 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2681 int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2684 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2685 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2686 "task_team=%p ntasks=%d head=%u tail=%u\n",
2687 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2688 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2692 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2693 current = __kmp_threads[gtid]->th.th_current_task;
2694 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2695 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2697 victim_td->td.td_deque_head =
2698 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2700 if (!task_team->tt.tt_untied_task_encountered) {
2702 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2703 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2704 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2705 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2706 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2711 target = victim_td->td.td_deque_head;
2713 for (i = 1; i < ntasks; ++i) {
2714 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2715 taskdata = victim_td->td.td_deque[target];
2716 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2722 if (taskdata == NULL) {
2724 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2725 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2726 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2727 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2728 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2732 for (i = i + 1; i < ntasks; ++i) {
2734 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2735 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2739 victim_td->td.td_deque_tail ==
2740 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2741 victim_td->td.td_deque_tail = target;
2743 if (*thread_finished) {
2749 count = KMP_ATOMIC_INC(unfinished_threads);
2753 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2754 gtid, count + 1, task_team));
2756 *thread_finished = FALSE;
2758 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2760 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2764 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2765 "task_team=%p ntasks=%d head=%u tail=%u\n",
2766 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2767 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2769 task = KMP_TASKDATA_TO_TASK(taskdata);
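// Illustrative summary: a successful steal removes the oldest allowed task at
// the victim's head (or compacts the deque when an intermediate task is taken
// on the untied-task path above).  If the stealing thread had already counted
// itself out via *thread_finished, tt_unfinished_threads is re-incremented
// first so the wait in __kmp_task_team_wait keeps waiting while the stolen
// task runs.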
2783 static inline int __kmp_execute_tasks_template(
2784 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2785 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2786 kmp_int32 is_constrained) {
2787 kmp_task_team_t *task_team = thread->th.th_task_team;
2788 kmp_thread_data_t *threads_data;
2790 kmp_info_t *other_thread;
2791 kmp_taskdata_t *current_task = thread->th.th_current_task;
2792 std::atomic<kmp_int32> *unfinished_threads;
2793 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2794 tid = thread->th.th_info.ds.ds_tid;
2796 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2797 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2799 if (task_team == NULL || current_task == NULL)
2802 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2803 "*thread_finished=%d\n",
2804 gtid, final_spin, *thread_finished));
2806 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2807 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2808 KMP_DEBUG_ASSERT(threads_data != NULL);
2810 nthreads = task_team->tt.tt_nproc;
2811 unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2812 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2813 KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2819 if (use_own_tasks) {
2820 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2822 if ((task == NULL) && (nthreads > 1)) {
2826 if (victim_tid == -2) {
2827 victim_tid = threads_data[tid].td.td_deque_last_stolen;
2830 other_thread = threads_data[victim_tid].td.td_thr;
2832 if (victim_tid != -1) {
2834 } else if (!new_victim) {
2840 victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2841 if (victim_tid >= tid) {
2845 other_thread = threads_data[victim_tid].td.td_thr;
2855 if ((__kmp_tasking_mode == tskm_task_teams) &&
2856 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2857 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2860 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2861 other_thread->th.th_sleep_loc);
2874 task = __kmp_steal_task(other_thread, gtid, task_team,
2875 unfinished_threads, thread_finished,
2879 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2880 threads_data[tid].td.td_deque_last_stolen = victim_tid;
2887 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2896 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2897 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2898 if (itt_sync_obj == NULL) {
2900 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2902 __kmp_itt_task_starting(itt_sync_obj);
2905 __kmp_invoke_task(gtid, task, current_task);
2907 if (itt_sync_obj != NULL)
2908 __kmp_itt_task_finished(itt_sync_obj);
2915 if (flag == NULL || (!final_spin && flag->done_check())) {
2918 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2922 if (thread->th.th_task_team == NULL) {
2925 KMP_YIELD(__kmp_library == library_throughput);
2928 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2929 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2930 "other tasks, restart\n",
2941 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
2945 if (!*thread_finished) {
2948 count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
2949 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2950 "unfinished_threads to %d task_team=%p\n",
2951 gtid, count, task_team));
2952 *thread_finished = TRUE;
2960 if (flag != NULL && flag->done_check()) {
2963 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2971 if (thread->th.th_task_team == NULL) {
2973 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2983 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2989 int __kmp_execute_tasks_32(
2990 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2991 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2992 kmp_int32 is_constrained) {
2993 return __kmp_execute_tasks_template(
2994 thread, gtid, flag, final_spin,
2995 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2998 int __kmp_execute_tasks_64(
2999 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
3000 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3001 kmp_int32 is_constrained) {
3002 return __kmp_execute_tasks_template(
3003 thread, gtid, flag, final_spin,
3004 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3007 int __kmp_execute_tasks_oncore(
3008 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3009 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3010 kmp_int32 is_constrained) {
3011 return __kmp_execute_tasks_template(
3012 thread, gtid, flag, final_spin,
3013 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3019 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3020 kmp_info_t *this_thr) {
3021 kmp_thread_data_t *threads_data;
3022 int nthreads, i, is_init_thread;
3024 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3025 __kmp_gtid_from_thread(this_thr)));
3027 KMP_DEBUG_ASSERT(task_team != NULL);
3028 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3030 nthreads = task_team->tt.tt_nproc;
3031 KMP_DEBUG_ASSERT(nthreads > 0);
3032 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3035 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3037 if (!is_init_thread) {
3041 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3042 __kmp_gtid_from_thread(this_thr)));
3045 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3046 KMP_DEBUG_ASSERT(threads_data != NULL);
3048 if (__kmp_tasking_mode == tskm_task_teams &&
3049 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3053 for (i = 0; i < nthreads; i++) {
3054 volatile void *sleep_loc;
3055 kmp_info_t *thread = threads_data[i].td.td_thr;
3057 if (i == this_thr->th.th_info.ds.ds_tid) {
3066 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3068 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3069 __kmp_gtid_from_thread(this_thr),
3070 __kmp_gtid_from_thread(thread)));
3071 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3073 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3074 __kmp_gtid_from_thread(this_thr),
3075 __kmp_gtid_from_thread(thread)));
3080 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3081 __kmp_gtid_from_thread(this_thr)));
3114 static kmp_task_team_t *__kmp_free_task_teams = NULL;
3117 kmp_bootstrap_lock_t __kmp_task_team_lock =
3118 KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3125 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3126 kmp_thread_data_t *thread_data) {
3127 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3128 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3131 thread_data->td.td_deque_last_stolen = -1;
3133 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3134 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3135 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3139 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3140 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3144 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3145 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3146 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
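// Sizing note (illustrative): the deque is always indexed modulo its size via
// TASK_DEQUE_MASK(td), assumed to be td_deque_size - 1, e.g.
//   idx = (idx + 1) & TASK_DEQUE_MASK(thread_data->td);
// so INITIAL_TASK_DEQUE_SIZE and any later resize must keep the size a power
// of two.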
3152 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3153 if (thread_data->td.td_deque != NULL) {
3154 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3155 TCW_4(thread_data->td.td_deque_ntasks, 0);
3156 __kmp_free(thread_data->td.td_deque);
3157 thread_data->td.td_deque = NULL;
3158 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3161 #ifdef BUILD_TIED_TASK_STACK
3163 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3164 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3166 #endif // BUILD_TIED_TASK_STACK
3176 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3177 kmp_task_team_t *task_team) {
3178 kmp_thread_data_t **threads_data_p;
3179 kmp_int32 nthreads, maxthreads;
3180 int is_init_thread = FALSE;
3182 if (TCR_4(task_team->tt.tt_found_tasks)) {
3187 threads_data_p = &task_team->tt.tt_threads_data;
3188 nthreads = task_team->tt.tt_nproc;
3189 maxthreads = task_team->tt.tt_max_threads;
3194 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3196 if (!TCR_4(task_team->tt.tt_found_tasks)) {
3198 kmp_team_t *team = thread->th.th_team;
3201 is_init_thread = TRUE;
3202 if (maxthreads < nthreads) {
3204 if (*threads_data_p != NULL) {
3205 kmp_thread_data_t *old_data = *threads_data_p;
3206 kmp_thread_data_t *new_data = NULL;
3210 ("__kmp_realloc_task_threads_data: T#%d reallocating "
3211 "threads data for task_team %p, new_size = %d, old_size = %d\n",
3212 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3217 new_data = (kmp_thread_data_t *)__kmp_allocate(
3218 nthreads * sizeof(kmp_thread_data_t));
3220 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3221 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3223 #ifdef BUILD_TIED_TASK_STACK
3225 for (i = maxthreads; i < nthreads; i++) {
3226 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3227 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3229 #endif // BUILD_TIED_TASK_STACK
3231 (*threads_data_p) = new_data;
3232 __kmp_free(old_data);
3234 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3235 "threads data for task_team %p, size = %d\n",
3236 __kmp_gtid_from_thread(thread), task_team, nthreads));
3240 ANNOTATE_IGNORE_WRITES_BEGIN();
3241 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3242 nthreads * sizeof(kmp_thread_data_t));
3243 ANNOTATE_IGNORE_WRITES_END();
3244 #ifdef BUILD_TIED_TASK_STACK
3246 for (i = 0; i < nthreads; i++) {
3247 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3248 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3250 #endif // BUILD_TIED_TASK_STACK
3252 task_team->tt.tt_max_threads = nthreads;
3255 KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3259 for (i = 0; i < nthreads; i++) {
3260 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3261 thread_data->td.td_thr = team->t.t_threads[i];
3263 if (thread_data->td.td_deque_last_stolen >= nthreads) {
3267 thread_data->td.td_deque_last_stolen = -1;
3272 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3275 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3276 return is_init_thread;
3282 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3283 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3284 if (task_team->tt.tt_threads_data != NULL) {
3286 for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3287 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3289 __kmp_free(task_team->tt.tt_threads_data);
3290 task_team->tt.tt_threads_data = NULL;
3292 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3299 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
kmp_team_t *team) {
3301 kmp_task_team_t *task_team = NULL;
3304 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3305 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3307 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3309 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3310 if (__kmp_free_task_teams != NULL) {
3311 task_team = __kmp_free_task_teams;
3312 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3313 task_team->tt.tt_next = NULL;
3315 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3318 if (task_team == NULL) {
3319 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3320 "task team for team %p\n",
3321 __kmp_gtid_from_thread(thread), team));
3325 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3326 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3333 TCW_4(task_team->tt.tt_found_tasks, FALSE);
3334 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3335 task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3337 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3338 TCW_4(task_team->tt.tt_active, TRUE);
3340 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3341 "unfinished_threads init'd to %d\n",
3342 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3343 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3350 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3351 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3352 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3355 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3357 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3358 task_team->tt.tt_next = __kmp_free_task_teams;
3359 TCW_PTR(__kmp_free_task_teams, task_team);
3361 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3369 void __kmp_reap_task_teams(void) {
3370 kmp_task_team_t *task_team;
3372 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3374 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3375 while ((task_team = __kmp_free_task_teams) != NULL) {
3376 __kmp_free_task_teams = task_team->tt.tt_next;
3377 task_team->tt.tt_next = NULL;
3380 if (task_team->tt.tt_threads_data != NULL) {
3381 __kmp_free_task_threads_data(task_team);
3383 __kmp_free(task_team);
3385 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3392 void __kmp_wait_to_unref_task_teams(void) {
3397 KMP_INIT_YIELD(spins);
3405 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3406 thread = thread->th.th_next_pool) {
3410 if (TCR_PTR(thread->th.th_task_team) == NULL) {
3411 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3412 __kmp_gtid_from_thread(thread)));
3417 if (!__kmp_is_thread_alive(thread, &exit_val)) {
3418 thread->th.th_task_team = NULL;
3425 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3426 "unreference task_team\n",
3427 __kmp_gtid_from_thread(thread)));
3429 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3430 volatile void *sleep_loc;
3432 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3436 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3437 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3438 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3447 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3453 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3454 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3460 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3461 (always || team->t.t_nproc > 1)) {
3462 team->t.t_task_team[this_thr->th.th_task_state] =
3463 __kmp_allocate_task_team(this_thr, team);
3464 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3465 "for team %d at parity=%d\n",
3466 __kmp_gtid_from_thread(this_thr),
3467 team->t.t_task_team[this_thr->th.th_task_state],
3468 ((team != NULL) ? team->t.t_id : -1),
3469 this_thr->th.th_task_state));
3479 if (team->t.t_nproc > 1) {
3480 int other_team = 1 - this_thr->th.th_task_state;
3481 if (team->t.t_task_team[other_team] == NULL) {
3482 team->t.t_task_team[other_team] =
3483 __kmp_allocate_task_team(this_thr, team);
3484 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3485 "task_team %p for team %d at parity=%d\n",
3486 __kmp_gtid_from_thread(this_thr),
3487 team->t.t_task_team[other_team],
3488 ((team != NULL) ? team->t.t_id : -1), other_team));
3491 kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3492 if (!task_team->tt.tt_active ||
3493 team->t.t_nproc != task_team->tt.tt_nproc) {
3494 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3495 TCW_4(task_team->tt.tt_found_tasks, FALSE);
3496 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3497 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3499 TCW_4(task_team->tt.tt_active, TRUE);
3503 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3504 "%p for team %d at parity=%d\n",
3505 __kmp_gtid_from_thread(this_thr),
3506 team->t.t_task_team[other_team],
3507 ((team != NULL) ? team->t.t_id : -1), other_team));
3515 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3516 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3520 this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3523 TCW_PTR(this_thr->th.th_task_team,
3524 team->t.t_task_team[this_thr->th.th_task_state]);
3526 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3527 "%p from Team #%d (parity=%d)\n",
3528 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3529 ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
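// Illustrative note on the double-buffered task teams: the team keeps two
// slots, team->t.t_task_team[0] and [1], and each thread's th_task_state
// selects the slot for the current barrier generation.  __kmp_task_team_setup
// pre-allocates the "other" slot, so the flip in __kmp_task_team_sync
// (th_task_state = 1 - th_task_state) lets threads adopt an already
// initialized task team at the next barrier.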
3539 void __kmp_task_team_wait(
3540 kmp_info_t *this_thr,
3541 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3542 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3544 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3545 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3547 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3549 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3550 "(for unfinished_threads to reach 0) on task_team = %p\n",
3551 __kmp_gtid_from_thread(this_thr), task_team));
3555 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3556 &task_team->tt.tt_unfinished_threads),
3558 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3564 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3565 "setting active to false, setting local and team's pointer to NULL\n",
3566 __kmp_gtid_from_thread(this_thr), task_team));
3567 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3568 task_team->tt.tt_found_proxy_tasks == TRUE);
3569 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3570 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3571 TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3574 TCW_PTR(this_thr->th.th_task_team, NULL);
3583 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3584 std::atomic<kmp_uint32> *spin = RCAST(
3585 std::atomic<kmp_uint32> *,
3586 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3588 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3591 KMP_FSYNC_SPIN_INIT(spin, NULL);
3593 kmp_flag_32 spin_flag(spin, 0U);
3594 while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3595 &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3598 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3601 if (TCR_4(__kmp_global.g.g_done)) {
3602 if (__kmp_global.g.g_abort)
3603 __kmp_abort_thread();
3609 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3618 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
kmp_int32 pass) {
3620 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3621 kmp_task_team_t *task_team = taskdata->td_task_team;
3623 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3627 KMP_DEBUG_ASSERT(task_team != NULL);
3629 bool result = false;
3630 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3632 if (thread_data->td.td_deque == NULL) {
3636 ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3641 if (TCR_4(thread_data->td.td_deque_ntasks) >=
3642 TASK_DEQUE_SIZE(thread_data->td)) {
3645 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3650 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3653 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3654 __kmp_realloc_task_deque(thread, thread_data);
3658 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3660 if (TCR_4(thread_data->td.td_deque_ntasks) >=
3661 TASK_DEQUE_SIZE(thread_data->td)) {
3662 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3668 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3669 goto release_and_exit;
3671 __kmp_realloc_task_deque(thread, thread_data);
3677 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3679 thread_data->td.td_deque_tail =
3680 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3681 TCW_4(thread_data->td.td_deque_ntasks,
3682 TCR_4(thread_data->td.td_deque_ntasks) + 1);
3685 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3689 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
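// Usage sketch (see __kmp_proxy_task_completed_ooo below): callers retry
// __kmp_give_task over the team's threads with a growing 'pass' value; a
// larger pass permits the target deque to be enlarged via
// __kmp_realloc_task_deque before giving up on that thread, e.g.
//   do {
//     thread = team->t.t_threads[k];
//     k = (k + 1) % nthreads;
//   } while (!__kmp_give_task(thread, k, ptask, pass));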
3710 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3711 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3712 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3713 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3714 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3716 taskdata->td_flags.complete = 1;
3718 if (taskdata->td_taskgroup)
3719 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3723 KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3726 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3727 kmp_int32 children = 0;
3731 children = KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3732 KMP_DEBUG_ASSERT(children >= 0);
3735 KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3738 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3739 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3740 kmp_info_t *thread = __kmp_threads[gtid];
3742 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3743 KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3748 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3751 __kmp_release_deps(gtid, taskdata);
3752 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3764 KMP_DEBUG_ASSERT(ptask != NULL);
3765 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3767 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3770 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3772 __kmp_first_top_half_finish_proxy(taskdata);
3773 __kmp_second_top_half_finish_proxy(taskdata);
3774 __kmp_bottom_half_finish_proxy(gtid, ptask);
3777 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3789 KMP_DEBUG_ASSERT(ptask != NULL);
3790 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3794 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3797 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3799 __kmp_first_top_half_finish_proxy(taskdata);
3803 kmp_team_t *team = taskdata->td_team;
3804 kmp_int32 nthreads = team->t.t_nproc;
3809 kmp_int32 start_k = 0;
3811 kmp_int32 k = start_k;
3815 thread = team->t.t_threads[k];
3816 k = (k + 1) % nthreads;
} while (!__kmp_give_task(thread, k, ptask, pass));
3824 __kmp_second_top_half_finish_proxy(taskdata);
3828 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3832 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
kmp_task_t *task) {
3834 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
3835 if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
3836 td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
3837 td->td_allow_completion_event.ed.task = task;
3838 __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
3840 return &td->td_allow_completion_event;
3843 void __kmp_fulfill_event(kmp_event_t *event) {
3844 if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
3845 kmp_task_t *ptask = event->ed.task;
3846 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3847 bool detached = false;
3848 int gtid = __kmp_get_gtid();
3850 if (taskdata->td_flags.proxy == TASK_PROXY) {
3853 event->type = KMP_EVENT_UNINITIALIZED;
3858 __kmp_acquire_tas_lock(&event->lock, gtid);
3859 if (taskdata->td_flags.proxy == TASK_PROXY)
3861 event->type = KMP_EVENT_UNINITIALIZED;
3862 __kmp_release_tas_lock(&event->lock, gtid);
3868 kmp_team_t *team = taskdata->td_team;
3869 kmp_info_t *thread = __kmp_get_thread();
3870 if (thread->th.th_team == team) {
3888 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3890 kmp_taskdata_t *taskdata;
3891 kmp_taskdata_t *taskdata_src;
3892 kmp_taskdata_t *parent_task = thread->th.th_current_task;
3893 size_t shareds_offset;
3896 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3898 taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3899 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3901 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3902 task_size = taskdata_src->td_size_alloc;
3905 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3908 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3910 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3912 KMP_MEMCPY(taskdata, taskdata_src, task_size);
3914 task = KMP_TASKDATA_TO_TASK(taskdata);
3917 taskdata->td_task_id = KMP_GEN_TASK_ID();
3918 if (task->shareds != NULL) {
3919 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3920 task->shareds = &((char *)taskdata)[shareds_offset];
3921 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3924 taskdata->td_alloc_thread = thread;
3925 taskdata->td_parent = parent_task;
3926 taskdata->td_taskgroup = parent_task->td_taskgroup;
3932 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3933 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
3934 if (parent_task->td_taskgroup)
3935 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
3938 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3939 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
3943 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3944 thread, taskdata, taskdata->td_parent));
3946 if (UNLIKELY(ompt_enabled.enabled))
3947 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
3956 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3958 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
3963 class kmp_taskloop_bounds_t {
3965 const kmp_taskdata_t *taskdata;
3966 size_t lower_offset;
3967 size_t upper_offset;
3970 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
3971 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
3972 lower_offset((char *)lb - (char *)task),
3973 upper_offset((char *)ub - (char *)task) {
3974 KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
3975 KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
3977 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
3978 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
3979 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
3980 size_t get_lower_offset() const { return lower_offset; }
3981 size_t get_upper_offset() const { return upper_offset; }
3982 kmp_uint64 get_lb() const {
3984 #if defined(KMP_GOMP_COMPAT)
3986 if (!taskdata->td_flags.native) {
3987 retval = *(kmp_int64 *)((char *)task + lower_offset);
3990 if (taskdata->td_size_loop_bounds == 4) {
3991 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
3992 retval = (kmp_int64)*lb;
3994 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
3995 retval = (kmp_int64)*lb;
3999 retval = *(kmp_int64 *)((char *)task + lower_offset);
4000 #endif // defined(KMP_GOMP_COMPAT)
4003 kmp_uint64 get_ub() const {
4005 #if defined(KMP_GOMP_COMPAT)
4007 if (!taskdata->td_flags.native) {
4008 retval = *(kmp_int64 *)((char *)task + upper_offset);
4011 if (taskdata->td_size_loop_bounds == 4) {
4012 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4013 retval = (kmp_int64)*ub;
4015 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4016 retval = (kmp_int64)*ub;
4020 retval = *(kmp_int64 *)((char *)task + upper_offset);
4021 #endif // defined(KMP_GOMP_COMPAT)
4024 void set_lb(kmp_uint64 lb) {
4025 #if defined(KMP_GOMP_COMPAT)
4027 if (!taskdata->td_flags.native) {
4028 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4031 if (taskdata->td_size_loop_bounds == 4) {
4032 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4033 *lower = (kmp_uint32)lb;
4035 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4036 *lower = (kmp_uint64)lb;
4040 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4041 #endif // defined(KMP_GOMP_COMPAT)
4043 void set_ub(kmp_uint64 ub) {
4044 #if defined(KMP_GOMP_COMPAT)
4046 if (!taskdata->td_flags.native) {
4047 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4050 if (taskdata->td_size_loop_bounds == 4) {
4051 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4052 *upper = (kmp_uint32)ub;
4054 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4055 *upper = (kmp_uint64)ub;
4059 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4060 #endif // defined(KMP_GOMP_COMPAT)
4079 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4080 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4081 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4082 kmp_uint64 grainsize, kmp_uint64 extras,
4089 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4090 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4092 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4093 kmp_uint64 lower = task_bounds.get_lb();
4094 kmp_uint64 upper = task_bounds.get_ub();
4096 kmp_info_t *thread = __kmp_threads[gtid];
4097 kmp_taskdata_t *current_task = thread->th.th_current_task;
4098 kmp_task_t *next_task;
4099 kmp_int32 lastpriv = 0;
4101 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4102 KMP_DEBUG_ASSERT(num_tasks > extras);
4103 KMP_DEBUG_ASSERT(num_tasks > 0);
4104 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4105 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4106 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
4110 for (i = 0; i < num_tasks; ++i) {
4111 kmp_uint64 chunk_minus_1;
4113 chunk_minus_1 = grainsize - 1;
4115 chunk_minus_1 = grainsize;
4118 upper = lower + st * chunk_minus_1;
4119 if (i == num_tasks - 1) {
4122 KMP_DEBUG_ASSERT(upper == *ub);
4123 if (upper == ub_glob)
4125 } else if (st > 0) {
4126 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4127 if ((kmp_uint64)st > ub_glob - upper)
4130 KMP_DEBUG_ASSERT(upper + st < *ub);
4131 if (upper - ub_glob < (kmp_uint64)(-st))
4135 next_task = __kmp_task_dup_alloc(thread, task);
4136 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4137 kmp_taskloop_bounds_t next_task_bounds =
4138 kmp_taskloop_bounds_t(next_task, task_bounds);
4141 next_task_bounds.set_lb(lower);
4142 if (next_taskdata->td_flags.native) {
4143 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4145 next_task_bounds.set_ub(upper);
4147 if (ptask_dup != NULL)
4149 ptask_dup(next_task, task, lastpriv);
4151 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4152 "upper %lld stride %lld, (offsets %p %p)\n",
4153 gtid, i, next_task, lower, upper, st,
4154 next_task_bounds.get_lower_offset(),
4155 next_task_bounds.get_upper_offset()));
4157 __kmp_omp_taskloop_task(NULL, gtid, next_task,
4160 __kmp_omp_task(gtid, next_task, true);
4165 __kmp_task_start(gtid, task, current_task);
4167 __kmp_task_finish<false>(gtid, task, current_task);
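// Worked example (illustrative) of the splitting invariant asserted above:
// with tc = 10 iterations and num_tasks = 4, grainsize = tc / num_tasks = 2
// and extras = tc % num_tasks = 2; the first 'extras' chunks get grainsize + 1
// iterations, so the chunk sizes are 3, 3, 2, 2 and
// tc == num_tasks * grainsize + extras holds.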
4172 typedef struct __taskloop_params {
4179 kmp_uint64 num_tasks;
4180 kmp_uint64 grainsize;
4183 kmp_uint64 num_t_min;
4187 } __taskloop_params_t;
4189 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4190 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4191 kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
4198 int __kmp_taskloop_task(int gtid, void *ptask) {
4199 __taskloop_params_t *p =
4200 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4201 kmp_task_t *task = p->task;
4202 kmp_uint64 *lb = p->lb;
4203 kmp_uint64 *ub = p->ub;
4204 void *task_dup = p->task_dup;
4206 kmp_int64 st = p->st;
4207 kmp_uint64 ub_glob = p->ub_glob;
4208 kmp_uint64 num_tasks = p->num_tasks;
4209 kmp_uint64 grainsize = p->grainsize;
4210 kmp_uint64 extras = p->extras;
4211 kmp_uint64 tc = p->tc;
4212 kmp_uint64 num_t_min = p->num_t_min;
4214 void *codeptr_ra = p->codeptr_ra;
4217 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4218 KMP_DEBUG_ASSERT(task != NULL);
4219 KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4220 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4221 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4224 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4225 if (num_tasks > num_t_min)
4226 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4227 grainsize, extras, tc, num_t_min,
4233 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4234 grainsize, extras, tc,
4240 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4261 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4262 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4263 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4264 kmp_uint64 grainsize, kmp_uint64 extras,
4265 kmp_uint64 tc, kmp_uint64 num_t_min,
4271 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4272 KMP_DEBUG_ASSERT(task != NULL);
4273 KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4274 KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4275 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4276 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4279 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4280 kmp_uint64 lower = *lb;
4281 kmp_info_t *thread = __kmp_threads[gtid];
4283 kmp_task_t *next_task;
4284 size_t lower_offset =
4285 (char *)lb - (char *)task;
4286 size_t upper_offset =
4287 (char *)ub - (char *)task;
4289 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4290 KMP_DEBUG_ASSERT(num_tasks > extras);
4291 KMP_DEBUG_ASSERT(num_tasks > 0);
4294 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4295 kmp_uint64 gr_size0 = grainsize;
4296 kmp_uint64 n_tsk0 = num_tasks >> 1;
4297 kmp_uint64 n_tsk1 = num_tasks - n_tsk0;
4298 if (n_tsk0 <= extras) {
4301 ext1 = extras - n_tsk0;
4302 tc0 = gr_size0 * n_tsk0;
4307 tc1 = grainsize * n_tsk1;
4310 ub0 = lower + st * (tc0 - 1);
4314 next_task = __kmp_task_dup_alloc(thread, task);
4316 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4317 if (ptask_dup != NULL)
4318 ptask_dup(next_task, task, 0);
4322 kmp_task_t *new_task =
4323 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4324 sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4325 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4326 p->task = next_task;
4327 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4328 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4329 p->task_dup = task_dup;
4331 p->ub_glob = ub_glob;
4332 p->num_tasks = n_tsk1;
4333 p->grainsize = grainsize;
4336 p->num_t_min = num_t_min;
4338 p->codeptr_ra = codeptr_ra;
4343 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4345 __kmp_omp_task(gtid, new_task, true);
4349 if (n_tsk0 > num_t_min)
4350 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4351 ext0, tc0, num_t_min,
4357 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4358 gr_size0, ext0, tc0,
4364 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4384 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4385 int sched, kmp_uint64 grainsize, void *task_dup) {
4386 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4387 KMP_DEBUG_ASSERT(task != NULL);
4390 #if OMPT_SUPPORT && OMPT_OPTIONAL
4391 OMPT_STORE_RETURN_ADDRESS(gtid);
4393 __kmpc_taskgroup(loc, gtid);
4398 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4401 kmp_uint64 lower = task_bounds.get_lb();
4402 kmp_uint64 upper = task_bounds.get_ub();
4403 kmp_uint64 ub_glob = upper;
4404 kmp_uint64 num_tasks = 0, extras = 0;
4405 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4406 kmp_info_t *thread = __kmp_threads[gtid];
4407 kmp_taskdata_t *current_task = thread->th.th_current_task;
4409 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4410 "grain %llu(%d), dup %p\n",
4411 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4415 tc = upper - lower + 1;
4416 } else if (st < 0) {
4417 tc = (lower - upper) / (-st) + 1;
4419 tc = (upper - lower) / st + 1;
4422 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4424 __kmp_task_start(gtid, task, current_task);
4426 __kmp_task_finish<false>(gtid, task, current_task);
4430 #if OMPT_SUPPORT && OMPT_OPTIONAL
4431 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4432 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4433 if (ompt_enabled.ompt_callback_work) {
4434 ompt_callbacks.ompt_callback(ompt_callback_work)(
4435 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4436 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4440 if (num_tasks_min == 0)
4443 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4449 grainsize = thread->th.th_team_nproc * 10;
4452 if (grainsize > tc) {
4457 num_tasks = grainsize;
4458 grainsize = tc / num_tasks;
4459 extras = tc % num_tasks;
4463 if (grainsize > tc) {
4468 num_tasks = tc / grainsize;
4470 grainsize = tc / num_tasks;
4471 extras = tc % num_tasks;
4475 KMP_ASSERT2(0, "unknown scheduling of taskloop");
4477 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4478 KMP_DEBUG_ASSERT(num_tasks > extras);
4479 KMP_DEBUG_ASSERT(num_tasks > 0);
4485 taskdata->td_flags.task_serial = 1;
4486 taskdata->td_flags.tiedness = TASK_TIED;
4488 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4489 grainsize, extras, tc,
4491 OMPT_GET_RETURN_ADDRESS(0),
4496 }
else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4497 KA_TRACE(20, (
"__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4498 "(%lld), grain %llu, extras %llu\n",
4499 gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4500 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4501 grainsize, extras, tc, num_tasks_min,
4503 OMPT_GET_RETURN_ADDRESS(0),
4507 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4508 "(%lld), grain %llu, extras %llu\n",
4509 gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4510 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4511 grainsize, extras, tc,
4513 OMPT_GET_RETURN_ADDRESS(0),
4518 #if OMPT_SUPPORT && OMPT_OPTIONAL
4519 if (ompt_enabled.ompt_callback_work) {
4520 ompt_callbacks.ompt_callback(ompt_callback_work)(
4521 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4522 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4527 #if OMPT_SUPPORT && OMPT_OPTIONAL
4528 OMPT_STORE_RETURN_ADDRESS(gtid);
4530 __kmpc_end_taskgroup(loc, gtid);
4532 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));