LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_i18n.h"
16 #include "kmp_itt.h"
17 #include "kmp_stats.h"
18 #include "kmp_wait_release.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #include "tsan_annotations.h"
25 
26 /* forward declaration */
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28  kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30  kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32  kmp_task_team_t *task_team);
33 
34 #ifdef OMP_45_ENABLED
35 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
36 #endif
37 
38 #ifdef BUILD_TIED_TASK_STACK
39 
40 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
41 // from top to bottom
42 //
43 // gtid: global thread identifier for thread containing stack
44 // thread_data: thread data for task team thread containing stack
45 // threshold: value above which the trace statement triggers
46 // location: string identifying call site of this function (for trace)
47 static void __kmp_trace_task_stack(kmp_int32 gtid,
48  kmp_thread_data_t *thread_data,
49  int threshold, char *location) {
50  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
51  kmp_taskdata_t **stack_top = task_stack->ts_top;
52  kmp_int32 entries = task_stack->ts_entries;
53  kmp_taskdata_t *tied_task;
54 
55  KA_TRACE(
56  threshold,
57  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
58  "first_block = %p, stack_top = %p \n",
59  location, gtid, entries, task_stack->ts_first_block, stack_top));
60 
61  KMP_DEBUG_ASSERT(stack_top != NULL);
62  KMP_DEBUG_ASSERT(entries > 0);
63 
64  while (entries != 0) {
65  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
66  // fix up ts_top if we need to pop from previous block
67  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
68  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
69 
70  stack_block = stack_block->sb_prev;
71  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
72  }
73 
74  // finish bookkeeping
75  stack_top--;
76  entries--;
77 
78  tied_task = *stack_top;
79 
80  KMP_DEBUG_ASSERT(tied_task != NULL);
81  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
82 
83  KA_TRACE(threshold,
84  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
85  "stack_top=%p, tied_task=%p\n",
86  location, gtid, entries, stack_top, tied_task));
87  }
88  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
89 
90  KA_TRACE(threshold,
91  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
92  location, gtid));
93 }
94 
95 // __kmp_init_task_stack: initialize the task stack for the first time
96 // after a thread_data structure is created.
97 // It should not be necessary to do this again (assuming the stack works).
98 //
99 // gtid: global thread identifier of calling thread
100 // thread_data: thread data for task team thread containing stack
101 static void __kmp_init_task_stack(kmp_int32 gtid,
102  kmp_thread_data_t *thread_data) {
103  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
104  kmp_stack_block_t *first_block;
105 
106  // set up the first block of the stack
107  first_block = &task_stack->ts_first_block;
108  task_stack->ts_top = (kmp_taskdata_t **)first_block;
109  memset((void *)first_block, '\0',
110  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
111 
112  // initialize the stack to be empty
113  task_stack->ts_entries = TASK_STACK_EMPTY;
114  first_block->sb_next = NULL;
115  first_block->sb_prev = NULL;
116 }
117 
118 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
119 //
120 // gtid: global thread identifier for calling thread
121 // thread_data: thread info for thread containing stack
122 static void __kmp_free_task_stack(kmp_int32 gtid,
123  kmp_thread_data_t *thread_data) {
124  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
125  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
126 
127  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
128  // free from the second block of the stack
129  while (stack_block != NULL) {
130  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
131 
132  stack_block->sb_next = NULL;
133  stack_block->sb_prev = NULL;
134  if (stack_block != &task_stack->ts_first_block) {
135  __kmp_thread_free(__kmp_threads[gtid],
136  stack_block); // free the block, if not the first
137  }
138  stack_block = next_block;
139  }
140  // initialize the stack to be empty
141  task_stack->ts_entries = 0;
142  task_stack->ts_top = NULL;
143 }
144 
145 // __kmp_push_task_stack: Push the tied task onto the task stack.
146 // Grow the stack if necessary by allocating another block.
147 //
148 // gtid: global thread identifier for calling thread
149 // thread: thread info for thread containing stack
150 // tied_task: the task to push on the stack
151 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
152  kmp_taskdata_t *tied_task) {
153  // GEH - need to consider what to do if tt_threads_data not allocated yet
154  kmp_thread_data_t *thread_data =
155  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
156  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
157 
158  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
159  return; // Don't push anything on stack if team or team tasks are serialized
160  }
161 
162  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
163  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
164 
165  KA_TRACE(20,
166  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
167  gtid, thread, tied_task));
168  // Store entry
169  *(task_stack->ts_top) = tied_task;
170 
171  // Do bookkeeping for next push
172  task_stack->ts_top++;
173  task_stack->ts_entries++;
174 
175  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
176  // Find beginning of this task block
177  kmp_stack_block_t *stack_block =
178  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
179 
180  // Check if we already have a block
181  if (stack_block->sb_next !=
182  NULL) { // reset ts_top to beginning of next block
183  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
184  } else { // Alloc new block and link it up
185  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
186  thread, sizeof(kmp_stack_block_t));
187 
188  task_stack->ts_top = &new_block->sb_block[0];
189  stack_block->sb_next = new_block;
190  new_block->sb_prev = stack_block;
191  new_block->sb_next = NULL;
192 
193  KA_TRACE(
194  30,
195  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
196  gtid, tied_task, new_block));
197  }
198  }
199  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
200  tied_task));
201 }
202 
203 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
204 // the task, just check to make sure it matches the ending task passed in.
205 //
206 // gtid: global thread identifier for the calling thread
207 // thread: thread info structure containing stack
208 // tied_task: the task popped off the stack
209 // ending_task: the task that is ending (should match popped task)
210 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
211  kmp_taskdata_t *ending_task) {
212  // GEH - need to consider what to do if tt_threads_data not allocated yet
213  kmp_thread_data_t *thread_data =
214  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
215  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
216  kmp_taskdata_t *tied_task;
217 
218  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
219  // Don't pop anything from stack if team or team tasks are serialized
220  return;
221  }
222 
223  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
224  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
225 
226  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
227  thread));
228 
229  // fix up ts_top if we need to pop from previous block
230  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
231  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
232 
233  stack_block = stack_block->sb_prev;
234  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
235  }
236 
237  // finish bookkeeping
238  task_stack->ts_top--;
239  task_stack->ts_entries--;
240 
241  tied_task = *(task_stack->ts_top);
242 
243  KMP_DEBUG_ASSERT(tied_task != NULL);
244  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
245  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
246 
247  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
248  tied_task));
249  return;
250 }
251 #endif /* BUILD_TIED_TASK_STACK */
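// ---------------------------------------------------------------------------
// Editor's note: the routines above keep suspended tied tasks in fixed-size
// blocks linked through sb_next/sb_prev, with ts_top pointing at the next free
// slot and a power-of-two block size so that "(entries & (block_size - 1)) == 0"
// detects a block boundary.  The code below is an editor's illustration of that
// idiom only; it is not part of the runtime and the demo_* names are invented.
#include <cstdlib>

enum { DEMO_BLOCK_SIZE = 8 }; // stand-in for TASK_STACK_BLOCK_SIZE

struct demo_block {
  void *slots[DEMO_BLOCK_SIZE]; // stand-in for sb_block (must be first member)
  demo_block *next;             // stand-in for sb_next
  demo_block *prev;             // stand-in for sb_prev
};

struct demo_stack {
  demo_block first; // first block lives inside the stack header
  void **top;       // next free slot (ts_top)
  int entries;      // ts_entries
};

static void demo_init(demo_stack *s) {
  s->top = &s->first.slots[0];
  s->entries = 0;
  s->first.next = s->first.prev = NULL;
}

static void demo_push(demo_stack *s, void *item) {
  *s->top++ = item;
  if ((++s->entries & (DEMO_BLOCK_SIZE - 1)) == 0) {
    // Crossed a block boundary: top now sits one past the current block's
    // slots, so stepping back DEMO_BLOCK_SIZE recovers the block header.
    demo_block *cur = (demo_block *)(s->top - DEMO_BLOCK_SIZE);
    if (cur->next == NULL) { // grow by linking a freshly allocated block
      demo_block *nb = (demo_block *)calloc(1, sizeof(demo_block));
      nb->prev = cur;
      cur->next = nb;
    }
    s->top = &cur->next->slots[0];
  }
}
// ---------------------------------------------------------------------------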
252 
253 // __kmp_push_task: Add a task to the thread's deque
254 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
255  kmp_info_t *thread = __kmp_threads[gtid];
256  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
257  kmp_task_team_t *task_team = thread->th.th_task_team;
258  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
259  kmp_thread_data_t *thread_data;
260 
261  KA_TRACE(20,
262  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
263 
264  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
265  // untied task needs to increment counter so that the task structure is not
266  // freed prematurely
267  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
268  KA_TRACE(
269  20,
270  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
271  gtid, counter, taskdata));
272  }
273 
274  // The first check avoids building task_team thread data if serialized
275  if (taskdata->td_flags.task_serial) {
276  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
277  "TASK_NOT_PUSHED for task %p\n",
278  gtid, taskdata));
279  return TASK_NOT_PUSHED;
280  }
281 
282  // Now that serialized tasks have returned, we can assume that we are not in
283  // immediate exec mode
284  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
285  if (!KMP_TASKING_ENABLED(task_team)) {
286  __kmp_enable_tasking(task_team, thread);
287  }
288  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
289  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
290 
291  // Find tasking deque specific to encountering thread
292  thread_data = &task_team->tt.tt_threads_data[tid];
293 
294  // No lock needed since only owner can allocate
295  if (thread_data->td.td_deque == NULL) {
296  __kmp_alloc_task_deque(thread, thread_data);
297  }
298 
299  // Check if deque is full
300  if (TCR_4(thread_data->td.td_deque_ntasks) >=
301  TASK_DEQUE_SIZE(thread_data->td)) {
302  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
303  "TASK_NOT_PUSHED for task %p\n",
304  gtid, taskdata));
305  return TASK_NOT_PUSHED;
306  }
307 
308  // Lock the deque for the task push operation
309  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
310 
311 #if OMP_45_ENABLED
312  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
313  if (TCR_4(thread_data->td.td_deque_ntasks) >=
314  TASK_DEQUE_SIZE(thread_data->td)) {
315  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
316  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning "
317  "TASK_NOT_PUSHED for task %p\n",
318  gtid, taskdata));
319  return TASK_NOT_PUSHED;
320  }
321 #else
322  // Must have room since no thread other than the calling thread can add tasks
323  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
324  TASK_DEQUE_SIZE(thread_data->td));
325 #endif
326 
327  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
328  taskdata; // Push taskdata
329  // Wrap index.
330  thread_data->td.td_deque_tail =
331  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
332  TCW_4(thread_data->td.td_deque_ntasks,
333  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
334 
335  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
336  "task=%p ntasks=%d head=%u tail=%u\n",
337  gtid, taskdata, thread_data->td.td_deque_ntasks,
338  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
339 
340  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
341 
342  return TASK_SUCCESSFULLY_PUSHED;
343 }
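// ---------------------------------------------------------------------------
// Editor's note: __kmp_push_task above appends to a bounded circular buffer
// whose capacity is a power of two, so the tail index wraps with a mask rather
// than a modulo.  This is an editor's sketch of just that arithmetic (the
// bootstrap lock, deque resizing and the exact head/tail ownership rules of the
// real deque are omitted); the demo_* names are invented.
#include <cstdint>

struct demo_deque {
  void **buf;           // storage with "size" slots (size is a power of two)
  std::uint32_t size;
  std::uint32_t head;   // one end is used by the owning thread, the other by
  std::uint32_t tail;   // thieves (__kmp_remove_my_task / __kmp_steal_task
  std::uint32_t ntasks; // elsewhere in this file); ntasks is the occupancy
};

// Mirrors the "full -> TASK_NOT_PUSHED" path: when false is returned the
// caller executes the task itself instead of deferring it.
static bool demo_push_tail(demo_deque *d, void *task) {
  if (d->ntasks >= d->size)
    return false;                          // deque full, do not defer
  d->buf[d->tail] = task;                  // store at tail
  d->tail = (d->tail + 1) & (d->size - 1); // wrap the index with the mask
  d->ntasks++;
  return true;
}
// ---------------------------------------------------------------------------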
344 
345 // __kmp_pop_current_task_from_thread: set up current task from called thread
346 // when team ends
347 //
348 // this_thr: thread structure to set current_task in.
349 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
350  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
351  "this_thread=%p, curtask=%p, "
352  "curtask_parent=%p\n",
353  0, this_thr, this_thr->th.th_current_task,
354  this_thr->th.th_current_task->td_parent));
355 
356  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
357 
358  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
359  "this_thread=%p, curtask=%p, "
360  "curtask_parent=%p\n",
361  0, this_thr, this_thr->th.th_current_task,
362  this_thr->th.th_current_task->td_parent));
363 }
364 
365 // __kmp_push_current_task_to_thread: set up current task in called thread for a
366 // new team
367 //
368 // this_thr: thread structure to set up
369 // team: team for implicit task data
370 // tid: thread within team to set up
371 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
372  int tid) {
373  // current task of the thread is a parent of the new just created implicit
374  // tasks of new team
375  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
376  "curtask=%p "
377  "parent_task=%p\n",
378  tid, this_thr, this_thr->th.th_current_task,
379  team->t.t_implicit_task_taskdata[tid].td_parent));
380 
381  KMP_DEBUG_ASSERT(this_thr != NULL);
382 
383  if (tid == 0) {
384  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
385  team->t.t_implicit_task_taskdata[0].td_parent =
386  this_thr->th.th_current_task;
387  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
388  }
389  } else {
390  team->t.t_implicit_task_taskdata[tid].td_parent =
391  team->t.t_implicit_task_taskdata[0].td_parent;
392  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
393  }
394 
395  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
396  "curtask=%p "
397  "parent_task=%p\n",
398  tid, this_thr, this_thr->th.th_current_task,
399  team->t.t_implicit_task_taskdata[tid].td_parent));
400 }
401 
402 // __kmp_task_start: bookkeeping for a task starting execution
403 //
404 // GTID: global thread id of calling thread
405 // task: task starting execution
406 // current_task: task suspending
407 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
408  kmp_taskdata_t *current_task) {
409  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
410  kmp_info_t *thread = __kmp_threads[gtid];
411 
412  KA_TRACE(10,
413  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
414  gtid, taskdata, current_task));
415 
416  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
417 
418  // mark currently executing task as suspended
419  // TODO: GEH - make sure root team implicit task is initialized properly.
420  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
421  current_task->td_flags.executing = 0;
422 
423 // Add task to stack if tied
424 #ifdef BUILD_TIED_TASK_STACK
425  if (taskdata->td_flags.tiedness == TASK_TIED) {
426  __kmp_push_task_stack(gtid, thread, taskdata);
427  }
428 #endif /* BUILD_TIED_TASK_STACK */
429 
430  // mark starting task as executing and as current task
431  thread->th.th_current_task = taskdata;
432 
433  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
434  taskdata->td_flags.tiedness == TASK_UNTIED);
435  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
436  taskdata->td_flags.tiedness == TASK_UNTIED);
437  taskdata->td_flags.started = 1;
438  taskdata->td_flags.executing = 1;
439  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
440  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
441 
442  // GEH TODO: shouldn't we pass some sort of location identifier here?
443  // APT: yes, we will pass location here.
444  // need to store current thread state (in a thread or taskdata structure)
445  // before setting work_state, otherwise wrong state is set after end of task
446 
447  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
448 
449  return;
450 }
451 
452 #if OMPT_SUPPORT
453 //------------------------------------------------------------------------------
454 // __ompt_task_init:
455 // Initialize OMPT fields maintained by a task. This will only be called after
456 // ompt_start_tool, so we already know whether ompt is enabled or not.
457 
458 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
459  // The calls to __ompt_task_init already have the ompt_enabled condition.
460  task->ompt_task_info.task_data.value = 0;
461  task->ompt_task_info.frame.exit_frame = NULL;
462  task->ompt_task_info.frame.enter_frame = NULL;
463 #if OMP_40_ENABLED
464  task->ompt_task_info.ndeps = 0;
465  task->ompt_task_info.deps = NULL;
466 #endif /* OMP_40_ENABLED */
467 }
468 
469 // __ompt_task_start:
470 // Build and trigger task-begin event
471 static inline void __ompt_task_start(kmp_task_t *task,
472  kmp_taskdata_t *current_task,
473  kmp_int32 gtid) {
474  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
475  ompt_task_status_t status = ompt_task_others;
476  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
477  status = ompt_task_yield;
478  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
479  }
480  /* let OMPT know that we're about to run this task */
481  if (ompt_enabled.ompt_callback_task_schedule) {
482  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
483  &(current_task->ompt_task_info.task_data), status,
484  &(taskdata->ompt_task_info.task_data));
485  }
486  taskdata->ompt_task_info.scheduling_parent = current_task;
487 }
488 
489 // __ompt_task_finish:
490 // Build and trigger final task-schedule event
491 static inline void
492 __ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
493  ompt_task_status_t status = ompt_task_complete) {
494  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
495  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
496  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
497  status = ompt_task_cancel;
498  }
499 
500  /* let OMPT know that we're returning to the callee task */
501  if (ompt_enabled.ompt_callback_task_schedule) {
502  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
503  &(taskdata->ompt_task_info.task_data), status,
504  &((resumed_task ? resumed_task
505  : (taskdata->ompt_task_info.scheduling_parent
506  ? taskdata->ompt_task_info.scheduling_parent
507  : taskdata->td_parent))
508  ->ompt_task_info.task_data));
509  }
510 }
511 #endif
512 
513 template <bool ompt>
514 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
515  kmp_task_t *task,
516  void *frame_address,
517  void *return_address) {
518  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
519  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
520 
521  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
522  "current_task=%p\n",
523  gtid, loc_ref, taskdata, current_task));
524 
525  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
526  // untied task needs to increment counter so that the task structure is not
527  // freed prematurely
528  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
529  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
530  "incremented for task %p\n",
531  gtid, counter, taskdata));
532  }
533 
534  taskdata->td_flags.task_serial =
535  1; // Execute this task immediately, not deferred.
536  __kmp_task_start(gtid, task, current_task);
537 
538 #if OMPT_SUPPORT
539  if (ompt) {
540  if (current_task->ompt_task_info.frame.enter_frame == NULL) {
541  current_task->ompt_task_info.frame.enter_frame =
542  taskdata->ompt_task_info.frame.exit_frame = frame_address;
543  }
544  if (ompt_enabled.ompt_callback_task_create) {
545  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
546  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
547  &(parent_info->task_data), &(parent_info->frame),
548  &(taskdata->ompt_task_info.task_data),
549  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
550  return_address);
551  }
552  __ompt_task_start(task, current_task, gtid);
553  }
554 #endif // OMPT_SUPPORT
555 
556  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
557  loc_ref, taskdata));
558 }
559 
560 #if OMPT_SUPPORT
561 OMPT_NOINLINE
562 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
563  kmp_task_t *task,
564  void *frame_address,
565  void *return_address) {
566  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
567  return_address);
568 }
569 #endif // OMPT_SUPPORT
570 
571 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
572 // execution
573 //
574 // loc_ref: source location information; points to beginning of task block.
575 // gtid: global thread number.
576 // task: task thunk for the started task.
577 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
578  kmp_task_t *task) {
579 #if OMPT_SUPPORT
580  if (UNLIKELY(ompt_enabled.enabled)) {
581  OMPT_STORE_RETURN_ADDRESS(gtid);
582  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
583  OMPT_GET_FRAME_ADDRESS(1),
584  OMPT_LOAD_RETURN_ADDRESS(gtid));
585  return;
586  }
587 #endif
588  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
589 }
590 
591 #ifdef TASK_UNUSED
592 // __kmpc_omp_task_begin: report that a given task has started execution
593 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
594 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
595  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
596 
597  KA_TRACE(
598  10,
599  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
600  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
601 
602  __kmp_task_start(gtid, task, current_task);
603 
604  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
605  loc_ref, KMP_TASK_TO_TASKDATA(task)));
606  return;
607 }
608 #endif // TASK_UNUSED
609 
610 // __kmp_free_task: free the current task space and the space for shareds
611 //
612 // gtid: Global thread ID of calling thread
613 // taskdata: task to free
614 // thread: thread data structure of caller
615 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
616  kmp_info_t *thread) {
617  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
618  taskdata));
619 
620  // Check to make sure all flags and counters have the correct values
621  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
622  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
623  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
624  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
625  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
626  taskdata->td_flags.task_serial == 1);
627  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
628 
629  taskdata->td_flags.freed = 1;
630  ANNOTATE_HAPPENS_BEFORE(taskdata);
631 // deallocate the taskdata and shared variable blocks associated with this task
632 #if USE_FAST_MEMORY
633  __kmp_fast_free(thread, taskdata);
634 #else /* ! USE_FAST_MEMORY */
635  __kmp_thread_free(thread, taskdata);
636 #endif
637 
638  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
639 }
640 
641 // __kmp_free_task_and_ancestors: free the current task and ancestors without
642 // children
643 //
644 // gtid: Global thread ID of calling thread
645 // taskdata: task to free
646 // thread: thread data structure of caller
647 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
648  kmp_taskdata_t *taskdata,
649  kmp_info_t *thread) {
650 #if OMP_45_ENABLED
651  // Proxy tasks must always be allowed to free their parents
652  // because they can be run in background even in serial mode.
653  kmp_int32 team_serial =
654  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
655  !taskdata->td_flags.proxy;
656 #else
657  kmp_int32 team_serial =
658  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
659 #endif
660  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
661 
662  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
663  KMP_DEBUG_ASSERT(children >= 0);
664 
665  // Now, go up the ancestor tree to see if any ancestors can now be freed.
666  while (children == 0) {
667  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
668 
669  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
670  "and freeing itself\n",
671  gtid, taskdata));
672 
673  // --- Deallocate my ancestor task ---
674  __kmp_free_task(gtid, taskdata, thread);
675 
676  taskdata = parent_taskdata;
677 
678  // Stop checking ancestors at implicit task instead of walking up ancestor
679  // tree to avoid premature deallocation of ancestors.
680  if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
681  return;
682 
683  // Predecrement simulated by "- 1" calculation
684  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
685  KMP_DEBUG_ASSERT(children >= 0);
686  }
687 
688  KA_TRACE(
689  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
690  "not freeing it yet\n",
691  gtid, taskdata, children));
692 }
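// ---------------------------------------------------------------------------
// Editor's note: a simplified model of the td_allocated_child_tasks protocol
// that __kmp_free_task_and_ancestors walks (editor's sketch, not runtime code;
// the proxy/serial special cases above are left out).  Each task's counter
// starts at 1 for the task itself and is incremented once per explicit child
// allocated; a finished task decrements its own counter and, on reaching zero,
// is freed and the decrement repeats on its parent.
#include <atomic>

struct demo_task {
  demo_task *parent;
  std::atomic<int> alloc_children; // 1 (self) + number of live explicit children
  bool is_implicit;
};

static void demo_release(demo_task *t) {
  // "Predecrement simulated by '- 1' calculation", as in the code above.
  int remaining = t->alloc_children.fetch_sub(1) - 1;
  while (remaining == 0) {
    demo_task *parent = t->parent;
    /* the real code calls __kmp_free_task(gtid, t, thread) here */
    t = parent;
    if (t == nullptr || t->is_implicit)
      return; // never walk past (or free) the implicit task
    remaining = t->alloc_children.fetch_sub(1) - 1;
  }
  // remaining > 0: some child is still allocated, leave the task for later.
}
// ---------------------------------------------------------------------------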
693 
694 // __kmp_task_finish: bookkeeping to do when a task finishes execution
695 //
696 // gtid: global thread ID for calling thread
697 // task: task to be finished
698 // resumed_task: task to be resumed. (may be NULL if task is serialized)
699 template <bool ompt>
700 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
701  kmp_taskdata_t *resumed_task) {
702  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
703  kmp_info_t *thread = __kmp_threads[gtid];
704  kmp_task_team_t *task_team =
705  thread->th.th_task_team; // might be NULL for serial teams...
706  kmp_int32 children = 0;
707 
708  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
709  "task %p\n",
710  gtid, taskdata, resumed_task));
711 
712  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
713 
714 // Pop task from stack if tied
715 #ifdef BUILD_TIED_TASK_STACK
716  if (taskdata->td_flags.tiedness == TASK_TIED) {
717  __kmp_pop_task_stack(gtid, thread, taskdata);
718  }
719 #endif /* BUILD_TIED_TASK_STACK */
720 
721  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
722  // untied task needs to check the counter so that the task structure is not
723  // freed prematurely
724  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
725  KA_TRACE(
726  20,
727  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
728  gtid, counter, taskdata));
729  if (counter > 0) {
730  // untied task is not done, to be continued possibly by other thread, do
731  // not free it now
732  if (resumed_task == NULL) {
733  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
734  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
735  // task is the parent
736  }
737  thread->th.th_current_task = resumed_task; // restore current_task
738  resumed_task->td_flags.executing = 1; // resume previous task
739  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
740  "resuming task %p\n",
741  gtid, taskdata, resumed_task));
742  return;
743  }
744  }
745 #if OMPT_SUPPORT
746  if (ompt)
747  __ompt_task_finish(task, resumed_task);
748 #endif
749 
750  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
751  taskdata->td_flags.complete = 1; // mark the task as completed
752  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
753  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
754 
755  // Only need to keep track of count if team parallel and tasking not
756  // serialized
757  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
758  // Predecrement simulated by "- 1" calculation
759  children =
760  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
761  KMP_DEBUG_ASSERT(children >= 0);
762 #if OMP_40_ENABLED
763  if (taskdata->td_taskgroup)
764  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
765 #if OMP_45_ENABLED
766  }
767  // if we found proxy tasks there could exist a dependency chain
768  // with the proxy task as origin
769  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
770  (task_team && task_team->tt.tt_found_proxy_tasks)) {
771 #endif
772  __kmp_release_deps(gtid, taskdata);
773 #endif
774  }
775 
776  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
777  // called. Otherwise, if a task is executed immediately from the release_deps
778  // code, the flag will be reset to 1 again by this same function
779  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
780  taskdata->td_flags.executing = 0; // suspend the finishing task
781 
782  KA_TRACE(
783  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
784  gtid, taskdata, children));
785 
786 #if OMP_40_ENABLED
787  /* If the task's destructor thunk flag has been set, we need to invoke the
788  destructor thunk that has been generated by the compiler. The code is
789  placed here, since at this point other tasks might have been released
790  hence overlapping the destructor invocations with some other work in the
791  released tasks. The OpenMP spec is not specific on when the destructors
792  are invoked, so we should be free to choose. */
793  if (taskdata->td_flags.destructors_thunk) {
794  kmp_routine_entry_t destr_thunk = task->data1.destructors;
795  KMP_ASSERT(destr_thunk);
796  destr_thunk(gtid, task);
797  }
798 #endif // OMP_40_ENABLED
799 
800  // bookkeeping for resuming task:
801  // GEH - note tasking_ser => task_serial
802  KMP_DEBUG_ASSERT(
803  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
804  taskdata->td_flags.task_serial);
805  if (taskdata->td_flags.task_serial) {
806  if (resumed_task == NULL) {
807  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
808  // task is the parent
809  }
810  } else {
811  KMP_DEBUG_ASSERT(resumed_task !=
812  NULL); // verify that the resumed task is passed as an argument
813  }
814 
815  // Free this task and then ancestor tasks if they have no children.
816  // Restore th_current_task first as suggested by John:
817  // johnmc: if an asynchronous inquiry peers into the runtime system
818  // it doesn't see the freed task as the current task.
819  thread->th.th_current_task = resumed_task;
820  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
821 
822  // TODO: GEH - make sure root team implicit task is initialized properly.
823  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
824  resumed_task->td_flags.executing = 1; // resume previous task
825 
826  KA_TRACE(
827  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
828  gtid, taskdata, resumed_task));
829 
830  return;
831 }
832 
833 template <bool ompt>
834 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
835  kmp_int32 gtid,
836  kmp_task_t *task) {
837  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
838  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
839  // this routine will provide task to resume
840  __kmp_task_finish<ompt>(gtid, task, NULL);
841 
842  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
843  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
844 
845 #if OMPT_SUPPORT
846  if (ompt) {
847  omp_frame_t *ompt_frame;
848  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
849  ompt_frame->enter_frame = NULL;
850  }
851 #endif
852 
853  return;
854 }
855 
856 #if OMPT_SUPPORT
857 OMPT_NOINLINE
858 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
859  kmp_task_t *task) {
860  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
861 }
862 #endif // OMPT_SUPPORT
863 
864 // __kmpc_omp_task_complete_if0: report that a task has completed execution
865 //
866 // loc_ref: source location information; points to end of task block.
867 // gtid: global thread number.
868 // task: task thunk for the completed task.
869 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
870  kmp_task_t *task) {
871 #if OMPT_SUPPORT
872  if (UNLIKELY(ompt_enabled.enabled)) {
873  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
874  return;
875  }
876 #endif
877  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
878 }
879 
880 #ifdef TASK_UNUSED
881 // __kmpc_omp_task_complete: report that a task has completed execution
882 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
883 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
884  kmp_task_t *task) {
885  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
886  loc_ref, KMP_TASK_TO_TASKDATA(task)));
887 
888  __kmp_task_finish<false>(gtid, task,
889  NULL); // Not sure how to find task to resume
890 
891  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
892  loc_ref, KMP_TASK_TO_TASKDATA(task)));
893  return;
894 }
895 #endif // TASK_UNUSED
896 
897 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
898 // task for a given thread
899 //
900 // loc_ref: reference to source location of parallel region
901 // this_thr: thread data structure corresponding to implicit task
902 // team: team for this_thr
903 // tid: thread id of given thread within team
904 // set_curr_task: TRUE if need to push current task to thread
905 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to
906 // have already been done elsewhere.
907 // TODO: Get better loc_ref. Value passed in may be NULL
908 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
909  kmp_team_t *team, int tid, int set_curr_task) {
910  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
911 
912  KF_TRACE(
913  10,
914  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
915  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
916 
917  task->td_task_id = KMP_GEN_TASK_ID();
918  task->td_team = team;
919  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
920  // in debugger)
921  task->td_ident = loc_ref;
922  task->td_taskwait_ident = NULL;
923  task->td_taskwait_counter = 0;
924  task->td_taskwait_thread = 0;
925 
926  task->td_flags.tiedness = TASK_TIED;
927  task->td_flags.tasktype = TASK_IMPLICIT;
928 #if OMP_45_ENABLED
929  task->td_flags.proxy = TASK_FULL;
930 #endif
931 
932  // All implicit tasks are executed immediately, not deferred
933  task->td_flags.task_serial = 1;
934  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
935  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
936 
937  task->td_flags.started = 1;
938  task->td_flags.executing = 1;
939  task->td_flags.complete = 0;
940  task->td_flags.freed = 0;
941 
942 #if OMP_40_ENABLED
943  task->td_depnode = NULL;
944 #endif
945  task->td_last_tied = task;
946 
947  if (set_curr_task) { // only do this init first time thread is created
948  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
949  // Not used: don't need to deallocate implicit task
950  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
951 #if OMP_40_ENABLED
952  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
953  task->td_dephash = NULL;
954 #endif
955  __kmp_push_current_task_to_thread(this_thr, team, tid);
956  } else {
957  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
958  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
959  }
960 
961 #if OMPT_SUPPORT
962  if (UNLIKELY(ompt_enabled.enabled))
963  __ompt_task_init(task, tid);
964 #endif
965 
966  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
967  team, task));
968 }
969 
970 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
971 // at the end of parallel regions. Some resources are kept for reuse in the next
972 // parallel region.
973 //
974 // thread: thread data structure corresponding to implicit task
975 void __kmp_finish_implicit_task(kmp_info_t *thread) {
976  kmp_taskdata_t *task = thread->th.th_current_task;
977  if (task->td_dephash)
978  __kmp_dephash_free_entries(thread, task->td_dephash);
979 }
980 
981 // __kmp_free_implicit_task: Release resources associated with implicit tasks
982 // when these regions are destroyed
983 //
984 // thread: thread data structure corresponding to implicit task
985 void __kmp_free_implicit_task(kmp_info_t *thread) {
986  kmp_taskdata_t *task = thread->th.th_current_task;
987  if (task && task->td_dephash) {
988  __kmp_dephash_free(thread, task->td_dephash);
989  task->td_dephash = NULL;
990  }
991 }
992 
993 // Round up a size to a multiple of val (a power of two): used to insert padding
994 // between structures co-allocated using a single malloc() call
995 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
996  if (size & (val - 1)) {
997  size &= ~(val - 1);
998  if (size <= KMP_SIZE_T_MAX - val) {
999  size += val; // Round up if there is no overflow.
1000  }
1001  }
1002  return size;
1003 } // __kmp_round_up_to_val
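// Editor's note: a worked example of the rounding above (editor's addition,
// assuming val = sizeof(void *) = 8):
//   __kmp_round_up_to_val(40, 8) == 40   // 40 & 7 == 0, already aligned
//   __kmp_round_up_to_val(41, 8) == 48   // 41 & 7 != 0: 41 & ~7 = 40, + 8 = 48
//   __kmp_round_up_to_val(47, 8) == 48   // 47 & ~7 = 40, + 8 = 48
// The KMP_SIZE_T_MAX test only skips the final "+ val" when that addition
// would overflow size_t.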
1004 
1005 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1006 //
1007 // loc_ref: source location information
1008 // gtid: global thread number.
1009 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1010 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1011 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1012 // private vars accessed in task.
1013 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1014 // in task.
1015 // task_entry: Pointer to task code entry point generated by compiler.
1016 // returns: a pointer to the allocated kmp_task_t structure (task).
1017 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1018  kmp_tasking_flags_t *flags,
1019  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1020  kmp_routine_entry_t task_entry) {
1021  kmp_task_t *task;
1022  kmp_taskdata_t *taskdata;
1023  kmp_info_t *thread = __kmp_threads[gtid];
1024  kmp_team_t *team = thread->th.th_team;
1025  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1026  size_t shareds_offset;
1027 
1028  if (!TCR_4(__kmp_init_middle))
1029  __kmp_middle_initialize();
1030 
1031  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1032  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1033  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1034  sizeof_shareds, task_entry));
1035 
1036  if (parent_task->td_flags.final) {
1037  if (flags->merged_if0) {
1038  }
1039  flags->final = 1;
1040  }
1041  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1042  // Untied task encountered causes the TSC algorithm to check entire deque of
1043  // the victim thread. If no untied task encountered, then checking the head
1044  // of the deque should be enough.
1045  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1046  }
1047 
1048 #if OMP_45_ENABLED
1049  if (flags->proxy == TASK_PROXY) {
1050  flags->tiedness = TASK_UNTIED;
1051  flags->merged_if0 = 1;
1052 
1053  /* are we running in a sequential parallel or tskm_immediate_exec... we need
1054  tasking support enabled */
1055  if ((thread->th.th_task_team) == NULL) {
1056  /* This should only happen if the team is serialized
1057  setup a task team and propagate it to the thread */
1058  KMP_DEBUG_ASSERT(team->t.t_serialized);
1059  KA_TRACE(30,
1060  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1061  gtid));
1062  __kmp_task_team_setup(
1063  thread, team,
1064  1); // 1 indicates setup the current team regardless of nthreads
1065  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1066  }
1067  kmp_task_team_t *task_team = thread->th.th_task_team;
1068 
1069  /* tasking must be enabled now as the task might not be pushed */
1070  if (!KMP_TASKING_ENABLED(task_team)) {
1071  KA_TRACE(
1072  30,
1073  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1074  __kmp_enable_tasking(task_team, thread);
1075  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1076  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1077  // No lock needed since only owner can allocate
1078  if (thread_data->td.td_deque == NULL) {
1079  __kmp_alloc_task_deque(thread, thread_data);
1080  }
1081  }
1082 
1083  if (task_team->tt.tt_found_proxy_tasks == FALSE)
1084  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1085  }
1086 #endif
1087 
1088  // Calculate shared structure offset including padding after kmp_task_t struct
1089  // to align pointers in shared struct
1090  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1091  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1092 
1093  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1094  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1095  shareds_offset));
1096  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1097  sizeof_shareds));
1098 
1099 // Avoid double allocation here by combining shareds with taskdata
1100 #if USE_FAST_MEMORY
1101  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1102  sizeof_shareds);
1103 #else /* ! USE_FAST_MEMORY */
1104  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1105  sizeof_shareds);
1106 #endif /* USE_FAST_MEMORY */
1107  ANNOTATE_HAPPENS_AFTER(taskdata);
1108 
1109  task = KMP_TASKDATA_TO_TASK(taskdata);
1110 
1111 // Make sure task & taskdata are aligned appropriately
1112 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1113  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1114  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1115 #else
1116  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1117  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1118 #endif
1119  if (sizeof_shareds > 0) {
1120  // Avoid double allocation here by combining shareds with taskdata
1121  task->shareds = &((char *)taskdata)[shareds_offset];
1122  // Make sure shareds struct is aligned to pointer size
1123  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1124  0);
1125  } else {
1126  task->shareds = NULL;
1127  }
1128  task->routine = task_entry;
1129  task->part_id = 0; // AC: Always start with 0 part id
1130 
1131  taskdata->td_task_id = KMP_GEN_TASK_ID();
1132  taskdata->td_team = team;
1133  taskdata->td_alloc_thread = thread;
1134  taskdata->td_parent = parent_task;
1135  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1136  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1137  taskdata->td_ident = loc_ref;
1138  taskdata->td_taskwait_ident = NULL;
1139  taskdata->td_taskwait_counter = 0;
1140  taskdata->td_taskwait_thread = 0;
1141  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1142 #if OMP_45_ENABLED
1143  // avoid copying icvs for proxy tasks
1144  if (flags->proxy == TASK_FULL)
1145 #endif
1146  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1147 
1148  taskdata->td_flags.tiedness = flags->tiedness;
1149  taskdata->td_flags.final = flags->final;
1150  taskdata->td_flags.merged_if0 = flags->merged_if0;
1151 #if OMP_40_ENABLED
1152  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1153 #endif // OMP_40_ENABLED
1154 #if OMP_45_ENABLED
1155  taskdata->td_flags.proxy = flags->proxy;
1156  taskdata->td_task_team = thread->th.th_task_team;
1157  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1158 #endif
1159  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1160 
1161  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1162  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1163 
1164  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1165  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1166 
1167  // GEH - Note we serialize the task if the team is serialized to make sure
1168  // implicit parallel region tasks are not left until program termination to
1169  // execute. Also, it helps locality to execute immediately.
1170 
1171  taskdata->td_flags.task_serial =
1172  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1173  taskdata->td_flags.tasking_ser);
1174 
1175  taskdata->td_flags.started = 0;
1176  taskdata->td_flags.executing = 0;
1177  taskdata->td_flags.complete = 0;
1178  taskdata->td_flags.freed = 0;
1179 
1180  taskdata->td_flags.native = flags->native;
1181 
1182  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1183  // start at one because counts current task and children
1184  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1185 #if OMP_40_ENABLED
1186  taskdata->td_taskgroup =
1187  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1188  taskdata->td_dephash = NULL;
1189  taskdata->td_depnode = NULL;
1190 #endif
1191  if (flags->tiedness == TASK_UNTIED)
1192  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1193  else
1194  taskdata->td_last_tied = taskdata;
1195 
1196 #if OMPT_SUPPORT
1197  if (UNLIKELY(ompt_enabled.enabled))
1198  __ompt_task_init(taskdata, gtid);
1199 #endif
1200 // Only need to keep track of child task counts if team parallel and tasking not
1201 // serialized or if it is a proxy task
1202 #if OMP_45_ENABLED
1203  if (flags->proxy == TASK_PROXY ||
1204  !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1205 #else
1206  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1207 #endif
1208  {
1209  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1210 #if OMP_40_ENABLED
1211  if (parent_task->td_taskgroup)
1212  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1213 #endif
1214  // Only need to keep track of allocated child tasks for explicit tasks since
1215  // implicit not deallocated
1216  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1217  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1218  }
1219  }
1220 
1221  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1222  gtid, taskdata, taskdata->td_parent));
1223  ANNOTATE_HAPPENS_BEFORE(task);
1224 
1225  return task;
1226 }
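// Editor's note: layout sketch (editor's addition) of the single allocation
// that __kmp_task_alloc carves up; KMP_TASKDATA_TO_TASK / KMP_TASK_TO_TASKDATA
// just step between the first two regions.
//
//   [ kmp_taskdata_t | kmp_task_t + private vars | pad ][ shared var ptrs ]
//   ^ taskdata        ^ task                            ^ task->shareds
//   |<-------------- shareds_offset ------------------>|<- sizeof_shareds ->|
//
// shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t, rounded up to
// sizeof(void *) so the shared-variable pointer array stays pointer-aligned;
// the total request is shareds_offset + sizeof_shareds.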
1227 
1228 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1229  kmp_int32 flags, size_t sizeof_kmp_task_t,
1230  size_t sizeof_shareds,
1231  kmp_routine_entry_t task_entry) {
1232  kmp_task_t *retval;
1233  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1234 
1235  input_flags->native = FALSE;
1236 // __kmp_task_alloc() sets up all other runtime flags
1237 
1238 #if OMP_45_ENABLED
1239  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1240  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1241  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1242  input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
1243  sizeof_shareds, task_entry));
1244 #else
1245  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1246  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1247  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1248  sizeof_kmp_task_t, sizeof_shareds, task_entry));
1249 #endif
1250 
1251  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1252  sizeof_shareds, task_entry);
1253 
1254  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1255 
1256  return retval;
1257 }
1258 
1259 // __kmp_invoke_task: invoke the specified task
1260 //
1261 // gtid: global thread ID of caller
1262 // task: the task to invoke
1263 // current_task: the task to resume after task invocation
1264 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1265  kmp_taskdata_t *current_task) {
1266  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1267  kmp_uint64 cur_time;
1268 #if OMP_40_ENABLED
1269  int discard = 0 /* false */;
1270 #endif
1271  KA_TRACE(
1272  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1273  gtid, taskdata, current_task));
1274  KMP_DEBUG_ASSERT(task);
1275 #if OMP_45_ENABLED
1276  if (taskdata->td_flags.proxy == TASK_PROXY &&
1277  taskdata->td_flags.complete == 1) {
1278  // This is a proxy task that was already completed but it needs to run
1279  // its bottom-half finish
1280  KA_TRACE(
1281  30,
1282  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1283  gtid, taskdata));
1284 
1285  __kmp_bottom_half_finish_proxy(gtid, task);
1286 
1287  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1288  "proxy task %p, resuming task %p\n",
1289  gtid, taskdata, current_task));
1290 
1291  return;
1292  }
1293 #endif
1294 
1295 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1296  if (__kmp_forkjoin_frames_mode == 3) {
1297  // Get the current time stamp to measure task execution time to correct
1298  // barrier imbalance time
1299  cur_time = __itt_get_timestamp();
1300  }
1301 #endif
1302 
1303 #if OMPT_SUPPORT
1304  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1305  // does not execute code.
1306  ompt_thread_info_t oldInfo;
1307  kmp_info_t *thread;
1308  if (UNLIKELY(ompt_enabled.enabled)) {
1309  // Store the threads states and restore them after the task
1310  thread = __kmp_threads[gtid];
1311  oldInfo = thread->th.ompt_thread_info;
1312  thread->th.ompt_thread_info.wait_id = 0;
1313  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1314  ? omp_state_work_serial
1315  : omp_state_work_parallel;
1316  taskdata->ompt_task_info.frame.exit_frame = OMPT_GET_FRAME_ADDRESS(0);
1317  }
1318 #endif
1319 
1320 #if OMP_45_ENABLED
1321  // Proxy tasks are not handled by the runtime
1322  if (taskdata->td_flags.proxy != TASK_PROXY) {
1323 #endif
1324  ANNOTATE_HAPPENS_AFTER(task);
1325  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1326 #if OMP_45_ENABLED
1327  }
1328 #endif
1329 
1330 #if OMP_40_ENABLED
1331  // TODO: cancel tasks if the parallel region has also been cancelled
1332  // TODO: check if this sequence can be hoisted above __kmp_task_start
1333  // if cancellation has been enabled for this run ...
1334  if (__kmp_omp_cancellation) {
1335  kmp_info_t *this_thr = __kmp_threads[gtid];
1336  kmp_team_t *this_team = this_thr->th.th_team;
1337  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1338  if ((taskgroup && taskgroup->cancel_request) ||
1339  (this_team->t.t_cancel_request == cancel_parallel)) {
1340 #if OMPT_SUPPORT && OMPT_OPTIONAL
1341  ompt_data_t *task_data;
1342  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1343  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1344  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1345  task_data,
1346  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1347  : ompt_cancel_parallel) |
1348  ompt_cancel_discarded_task,
1349  NULL);
1350  }
1351 #endif
1352  KMP_COUNT_BLOCK(TASK_cancelled);
1353  // this task belongs to a task group and we need to cancel it
1354  discard = 1 /* true */;
1355  }
1356  }
1357 
1358  // Invoke the task routine and pass in relevant data.
1359  // Thunks generated by gcc take a different argument list.
1360  if (!discard) {
1361  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1362  taskdata->td_last_tied = current_task->td_last_tied;
1363  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1364  }
1365 #if KMP_STATS_ENABLED
1366  KMP_COUNT_BLOCK(TASK_executed);
1367  switch (KMP_GET_THREAD_STATE()) {
1368  case FORK_JOIN_BARRIER:
1369  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1370  break;
1371  case PLAIN_BARRIER:
1372  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1373  break;
1374  case TASKYIELD:
1375  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1376  break;
1377  case TASKWAIT:
1378  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1379  break;
1380  case TASKGROUP:
1381  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1382  break;
1383  default:
1384  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1385  break;
1386  }
1387 #endif // KMP_STATS_ENABLED
1388 #endif // OMP_40_ENABLED
1389 
1390 // OMPT task begin
1391 #if OMPT_SUPPORT
1392  if (UNLIKELY(ompt_enabled.enabled))
1393  __ompt_task_start(task, current_task, gtid);
1394 #endif
1395 
1396 #ifdef KMP_GOMP_COMPAT
1397  if (taskdata->td_flags.native) {
1398  ((void (*)(void *))(*(task->routine)))(task->shareds);
1399  } else
1400 #endif /* KMP_GOMP_COMPAT */
1401  {
1402  (*(task->routine))(gtid, task);
1403  }
1404  KMP_POP_PARTITIONED_TIMER();
1405 
1406 #if OMP_40_ENABLED
1407  }
1408 #endif // OMP_40_ENABLED
1409 
1410 
1411 #if OMP_45_ENABLED
1412  // Proxy tasks are not handled by the runtime
1413  if (taskdata->td_flags.proxy != TASK_PROXY) {
1414 #endif
1415  ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1416 #if OMPT_SUPPORT
1417  if (UNLIKELY(ompt_enabled.enabled)) {
1418  thread->th.ompt_thread_info = oldInfo;
1419  if (taskdata->td_flags.tiedness == TASK_TIED) {
1420  taskdata->ompt_task_info.frame.exit_frame = NULL;
1421  }
1422  __kmp_task_finish<true>(gtid, task, current_task);
1423  } else
1424 #endif
1425  __kmp_task_finish<false>(gtid, task, current_task);
1426 #if OMP_45_ENABLED
1427  }
1428 #endif
1429 
1430 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1431  // Barrier imbalance - correct arrive time after the task finished
1432  if (__kmp_forkjoin_frames_mode == 3) {
1433  kmp_info_t *this_thr = __kmp_threads[gtid];
1434  if (this_thr->th.th_bar_arrive_time) {
1435  this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1436  }
1437  }
1438 #endif
1439  KA_TRACE(
1440  30,
1441  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1442  gtid, taskdata, current_task));
1443  return;
1444 }
1445 
1446 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1447 //
1448 // loc_ref: location of original task pragma (ignored)
1449 // gtid: Global Thread ID of encountering thread
1450 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1451 // Returns:
1452 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1453 // be resumed later.
1454 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1455 // resumed later.
1456 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1457  kmp_task_t *new_task) {
1458  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1459 
1460  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1461  loc_ref, new_taskdata));
1462 
1463 #if OMPT_SUPPORT
1464  kmp_taskdata_t *parent;
1465  if (UNLIKELY(ompt_enabled.enabled)) {
1466  parent = new_taskdata->td_parent;
1467  if (ompt_enabled.ompt_callback_task_create) {
1468  ompt_data_t task_data = ompt_data_none;
1469  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1470  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1471  parent ? &(parent->ompt_task_info.frame) : NULL,
1472  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1473  OMPT_GET_RETURN_ADDRESS(0));
1474  }
1475  }
1476 #endif
1477 
1478  /* Should we execute the new task or queue it? For now, let's just always try
1479  to queue it. If the queue fills up, then we'll execute it. */
1480 
1481  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1482  { // Execute this task immediately
1483  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1484  new_taskdata->td_flags.task_serial = 1;
1485  __kmp_invoke_task(gtid, new_task, current_task);
1486  }
1487 
1488  KA_TRACE(
1489  10,
1490  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1491  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1492  gtid, loc_ref, new_taskdata));
1493 
1494  ANNOTATE_HAPPENS_BEFORE(new_task);
1495 #if OMPT_SUPPORT
1496  if (UNLIKELY(ompt_enabled.enabled)) {
1497  parent->ompt_task_info.frame.enter_frame = NULL;
1498  }
1499 #endif
1500  return TASK_CURRENT_NOT_QUEUED;
1501 }
1502 
1503 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1504 //
1505 // gtid: Global Thread ID of encountering thread
1506 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1507 // serialize_immediate: if TRUE and the task is executed immediately, its
1508 // execution will be serialized
1509 // Returns:
1510 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1511 // be resumed later.
1512 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1513 // resumed later.
1514 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1515  bool serialize_immediate) {
1516  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1517 
1518 /* Should we execute the new task or queue it? For now, let's just always try to
1519  queue it. If the queue fills up, then we'll execute it. */
1520 #if OMP_45_ENABLED
1521  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1522  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1523 #else
1524  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1525 #endif
1526  { // Execute this task immediately
1527  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1528  if (serialize_immediate)
1529  new_taskdata->td_flags.task_serial = 1;
1530  __kmp_invoke_task(gtid, new_task, current_task);
1531  }
1532 
1533  ANNOTATE_HAPPENS_BEFORE(new_task);
1534  return TASK_CURRENT_NOT_QUEUED;
1535 }
1536 
1537 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1538 // non-thread-switchable task from the parent thread only!
1539 //
1540 // loc_ref: location of original task pragma (ignored)
1541 // gtid: Global Thread ID of encountering thread
1542 // new_task: non-thread-switchable task thunk allocated by
1543 // __kmp_omp_task_alloc()
1544 // Returns:
1545 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1546 // be resumed later.
1547 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1548 // resumed later.
1549 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1550  kmp_task_t *new_task) {
1551  kmp_int32 res;
1552  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1553 
1554 #if KMP_DEBUG || OMPT_SUPPORT
1555  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1556 #endif
1557  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1558  new_taskdata));
1559 
1560 #if OMPT_SUPPORT
1561  kmp_taskdata_t *parent = NULL;
1562  if (UNLIKELY(ompt_enabled.enabled)) {
1563  if (!new_taskdata->td_flags.started) {
1564  OMPT_STORE_RETURN_ADDRESS(gtid);
1565  parent = new_taskdata->td_parent;
1566  if (!parent->ompt_task_info.frame.enter_frame) {
1567  parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1568  }
1569  if (ompt_enabled.ompt_callback_task_create) {
1570  ompt_data_t task_data = ompt_data_none;
1571  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1572  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1573  parent ? &(parent->ompt_task_info.frame) : NULL,
1574  &(new_taskdata->ompt_task_info.task_data),
1575  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1576  OMPT_LOAD_RETURN_ADDRESS(gtid));
1577  }
1578  } else {
1579  // We are scheduling the continuation of an UNTIED task.
1580  // Scheduling back to the parent task.
1581  __ompt_task_finish(new_task,
1582  new_taskdata->ompt_task_info.scheduling_parent,
1583  ompt_task_others);
1584  new_taskdata->ompt_task_info.frame.exit_frame = NULL;
1585  }
1586  }
1587 #endif
1588 
1589  res = __kmp_omp_task(gtid, new_task, true);
1590 
1591  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1592  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1593  gtid, loc_ref, new_taskdata));
1594 #if OMPT_SUPPORT
1595  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1596  parent->ompt_task_info.frame.enter_frame = NULL;
1597  }
1598 #endif
1599  return res;
1600 }
1601 
1602 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1603 // a taskloop task with the correct OMPT return address
1604 //
1605 // loc_ref: location of original task pragma (ignored)
1606 // gtid: Global Thread ID of encountering thread
1607 // new_task: non-thread-switchable task thunk allocated by
1608 // __kmp_omp_task_alloc()
1609 // codeptr_ra: return address for OMPT callback
1610 // Returns:
1611 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1612 // be resumed later.
1613 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1614 // resumed later.
1615 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1616  kmp_task_t *new_task, void *codeptr_ra) {
1617  kmp_int32 res;
1618  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1619 
1620 #if KMP_DEBUG || OMPT_SUPPORT
1621  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1622 #endif
1623  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1624  new_taskdata));
1625 
1626 #if OMPT_SUPPORT
1627  kmp_taskdata_t *parent = NULL;
1628  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1629  parent = new_taskdata->td_parent;
1630  if (!parent->ompt_task_info.frame.enter_frame)
1631  parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1632  if (ompt_enabled.ompt_callback_task_create) {
1633  ompt_data_t task_data = ompt_data_none;
1634  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1635  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1636  parent ? &(parent->ompt_task_info.frame) : NULL,
1637  &(new_taskdata->ompt_task_info.task_data),
1638  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1639  codeptr_ra);
1640  }
1641  }
1642 #endif
1643 
1644  res = __kmp_omp_task(gtid, new_task, true);
1645 
1646  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1647  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1648  gtid, loc_ref, new_taskdata));
1649 #if OMPT_SUPPORT
1650  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1651  parent->ompt_task_info.frame.enter_frame = NULL;
1652  }
1653 #endif
1654  return res;
1655 }
1656 
1657 template <bool ompt>
1658 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1659  void *frame_address,
1660  void *return_address) {
1661  kmp_taskdata_t *taskdata;
1662  kmp_info_t *thread;
1663  int thread_finished = FALSE;
1664  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1665 
1666  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1667 
1668  if (__kmp_tasking_mode != tskm_immediate_exec) {
1669  thread = __kmp_threads[gtid];
1670  taskdata = thread->th.th_current_task;
1671 
1672 #if OMPT_SUPPORT && OMPT_OPTIONAL
1673  ompt_data_t *my_task_data;
1674  ompt_data_t *my_parallel_data;
1675 
1676  if (ompt) {
1677  my_task_data = &(taskdata->ompt_task_info.task_data);
1678  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1679 
1680  taskdata->ompt_task_info.frame.enter_frame = frame_address;
1681 
1682  if (ompt_enabled.ompt_callback_sync_region) {
1683  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1684  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1685  my_task_data, return_address);
1686  }
1687 
1688  if (ompt_enabled.ompt_callback_sync_region_wait) {
1689  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1690  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1691  my_task_data, return_address);
1692  }
1693  }
1694 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1695 
1696 // Debugger: The taskwait is active. Store the location and the thread that
1697 // encountered the taskwait.
1698 #if USE_ITT_BUILD
1699 // Note: These values are used by ITT events as well.
1700 #endif /* USE_ITT_BUILD */
1701  taskdata->td_taskwait_counter += 1;
1702  taskdata->td_taskwait_ident = loc_ref;
1703  taskdata->td_taskwait_thread = gtid + 1;
1704 
1705 #if USE_ITT_BUILD
1706  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1707  if (itt_sync_obj != NULL)
1708  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1709 #endif /* USE_ITT_BUILD */
1710 
1711  bool must_wait =
1712  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1713 
1714 #if OMP_45_ENABLED
1715  must_wait = must_wait || (thread->th.th_task_team != NULL &&
1716  thread->th.th_task_team->tt.tt_found_proxy_tasks);
1717 #endif
1718  if (must_wait) {
1719  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
1720  &(taskdata->td_incomplete_child_tasks)),
1721  0U);
1722  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1723  flag.execute_tasks(thread, gtid, FALSE,
1724  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1725  __kmp_task_stealing_constraint);
1726  }
1727  }
1728 #if USE_ITT_BUILD
1729  if (itt_sync_obj != NULL)
1730  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1731 #endif /* USE_ITT_BUILD */
1732 
1733  // Debugger: The taskwait is completed. Location remains, but thread is
1734  // negated.
1735  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1736 
1737 #if OMPT_SUPPORT && OMPT_OPTIONAL
1738  if (ompt) {
1739  if (ompt_enabled.ompt_callback_sync_region_wait) {
1740  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1741  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1742  my_task_data, return_address);
1743  }
1744  if (ompt_enabled.ompt_callback_sync_region) {
1745  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1746  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1747  my_task_data, return_address);
1748  }
1749  taskdata->ompt_task_info.frame.enter_frame = NULL;
1750  }
1751 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1752 
1753  ANNOTATE_HAPPENS_AFTER(taskdata);
1754  }
1755 
1756  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1757  "returning TASK_CURRENT_NOT_QUEUED\n",
1758  gtid, taskdata));
1759 
1760  return TASK_CURRENT_NOT_QUEUED;
1761 }
1762 
1763 #if OMPT_SUPPORT
1764 OMPT_NOINLINE
1765 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1766  void *frame_address,
1767  void *return_address) {
1768  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1769  return_address);
1770 }
1771 #endif // OMPT_SUPPORT
1772 
1773 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1774 // complete
1775 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1776 #if OMPT_SUPPORT && OMPT_OPTIONAL
1777  if (UNLIKELY(ompt_enabled.enabled)) {
1778  OMPT_STORE_RETURN_ADDRESS(gtid);
1779  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(1),
1780  OMPT_LOAD_RETURN_ADDRESS(gtid));
1781  }
1782 #endif
1783  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
1784 }
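
// Taken together, __kmpc_omp_task() and __kmpc_omp_taskwait() are the entry
// points a compiler typically emits for "#pragma omp task" followed by
// "#pragma omp taskwait". A minimal sketch (not part of the runtime; the loc
// and task arguments are assumed to come from the usual allocation path, e.g.
// __kmp_omp_task_alloc(), which is outside this excerpt):
#if 0 // illustrative sketch only, never compiled
void example_task_then_wait(ident_t *loc, kmp_int32 gtid, kmp_task_t *task) {
  // schedule the explicit task (deferred if it can be pushed to the deque)
  __kmpc_omp_task(loc, gtid, task);
  // block until all children of the current task have completed
  __kmpc_omp_taskwait(loc, gtid);
}
#endif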
1785 
1786 // __kmpc_omp_taskyield: switch to a different task
1787 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1788  kmp_taskdata_t *taskdata;
1789  kmp_info_t *thread;
1790  int thread_finished = FALSE;
1791 
1792  KMP_COUNT_BLOCK(OMP_TASKYIELD);
1793  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1794 
1795  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1796  gtid, loc_ref, end_part));
1797 
1798  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1799  thread = __kmp_threads[gtid];
1800  taskdata = thread->th.th_current_task;
1801 // Should we model this as a task wait or not?
1802 // Debugger: The taskwait is active. Store the location and the thread that
1803 // encountered the taskwait.
1804 #if USE_ITT_BUILD
1805 // Note: These values are used by ITT events as well.
1806 #endif /* USE_ITT_BUILD */
1807  taskdata->td_taskwait_counter += 1;
1808  taskdata->td_taskwait_ident = loc_ref;
1809  taskdata->td_taskwait_thread = gtid + 1;
1810 
1811 #if USE_ITT_BUILD
1812  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1813  if (itt_sync_obj != NULL)
1814  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1815 #endif /* USE_ITT_BUILD */
1816  if (!taskdata->td_flags.team_serial) {
1817  kmp_task_team_t *task_team = thread->th.th_task_team;
1818  if (task_team != NULL) {
1819  if (KMP_TASKING_ENABLED(task_team)) {
1820 #if OMPT_SUPPORT
1821  if (UNLIKELY(ompt_enabled.enabled))
1822  thread->th.ompt_thread_info.ompt_task_yielded = 1;
1823 #endif
1824  __kmp_execute_tasks_32(
1825  thread, gtid, NULL, FALSE,
1826  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1827  __kmp_task_stealing_constraint);
1828 #if OMPT_SUPPORT
1829  if (UNLIKELY(ompt_enabled.enabled))
1830  thread->th.ompt_thread_info.ompt_task_yielded = 0;
1831 #endif
1832  }
1833  }
1834  }
1835 #if USE_ITT_BUILD
1836  if (itt_sync_obj != NULL)
1837  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1838 #endif /* USE_ITT_BUILD */
1839 
1840  // Debugger: The taskwait is completed. Location remains, but thread is
1841  // negated.
1842  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1843  }
1844 
1845  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1846  "returning TASK_CURRENT_NOT_QUEUED\n",
1847  gtid, taskdata));
1848 
1849  return TASK_CURRENT_NOT_QUEUED;
1850 }
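
// __kmpc_omp_taskyield() is the entry point behind "#pragma omp taskyield".
// A minimal sketch (not part of the runtime; the end_part value of 0 is an
// assumption about typical compiler codegen):
#if 0 // illustrative sketch only, never compiled
void example_taskyield(ident_t *loc, kmp_int32 gtid) {
  __kmpc_omp_taskyield(loc, gtid, /*end_part=*/0);
}
#endif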
1851 
1852 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1853 #if OMP_45_ENABLED
1854 // Task Reduction implementation
1855 
1856 typedef struct kmp_task_red_flags {
1857  unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
1858  unsigned reserved31 : 31;
1859 } kmp_task_red_flags_t;
1860 
1861 // internal structure for reduction data item related info
1862 typedef struct kmp_task_red_data {
1863  void *reduce_shar; // shared reduction item
1864  size_t reduce_size; // size of data item
1865  void *reduce_priv; // thread specific data
1866  void *reduce_pend; // end of private data for comparison op
1867  void *reduce_init; // data initialization routine
1868  void *reduce_fini; // data finalization routine
1869  void *reduce_comb; // data combiner routine
1870  kmp_task_red_flags_t flags; // flags for additional info from compiler
1871 } kmp_task_red_data_t;
1872 
1873 // structure sent to us by the compiler - one per reduction item
1874 typedef struct kmp_task_red_input {
1875  void *reduce_shar; // shared reduction item
1876  size_t reduce_size; // size of data item
1877  void *reduce_init; // data initialization routine
1878  void *reduce_fini; // data finalization routine
1879  void *reduce_comb; // data combiner routine
1880  kmp_task_red_flags_t flags; // flags for additional info from compiler
1881 } kmp_task_red_input_t;
1882 
1892 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
1893  kmp_info_t *thread = __kmp_threads[gtid];
1894  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
1895  kmp_int32 nth = thread->th.th_team_nproc;
1896  kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
1897  kmp_task_red_data_t *arr;
1898 
1899  // check input data just in case
1900  KMP_ASSERT(tg != NULL);
1901  KMP_ASSERT(data != NULL);
1902  KMP_ASSERT(num > 0);
1903  if (nth == 1) {
1904  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
1905  gtid, tg));
1906  return (void *)tg;
1907  }
1908  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
1909  gtid, tg, num));
1910  arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
1911  thread, num * sizeof(kmp_task_red_data_t));
1912  for (int i = 0; i < num; ++i) {
1913  void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
1914  size_t size = input[i].reduce_size - 1;
1915  // round the size up to cache line per thread-specific item
1916  size += CACHE_LINE - size % CACHE_LINE;
1917  KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
1918  arr[i].reduce_shar = input[i].reduce_shar;
1919  arr[i].reduce_size = size;
1920  arr[i].reduce_init = input[i].reduce_init;
1921  arr[i].reduce_fini = input[i].reduce_fini;
1922  arr[i].reduce_comb = input[i].reduce_comb;
1923  arr[i].flags = input[i].flags;
1924  if (!input[i].flags.lazy_priv) {
1925  // allocate cache-line aligned block and fill it with zeros
1926  arr[i].reduce_priv = __kmp_allocate(nth * size);
1927  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
1928  if (f_init != NULL) {
1929  // initialize thread-specific items
1930  for (int j = 0; j < nth; ++j) {
1931  f_init((char *)(arr[i].reduce_priv) + j * size);
1932  }
1933  }
1934  } else {
1935  // only allocate space for pointers now,
1936  // objects will be lazily allocated/initialized once requested
1937  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
1938  }
1939  }
1940  tg->reduce_data = (void *)arr;
1941  tg->reduce_num_data = num;
1942  return (void *)tg;
1943 }
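
// The sketch below (not part of the runtime; the reducer callbacks and the
// shared variable are assumptions for illustration) shows how a caller would
// describe one reduction item in kmp_task_red_input_t and register it for the
// enclosing taskgroup via __kmpc_task_reduction_init().
#if 0 // illustrative sketch only, never compiled
static void example_red_init(void *priv) { *(long *)priv = 0; }
static void example_red_comb(void *shar, void *priv) {
  *(long *)shar += *(long *)priv;
}

static void *example_register_reduction(int gtid, long *shared_sum) {
  kmp_task_red_input_t in;
  in.reduce_shar = shared_sum;               // shared reduction variable
  in.reduce_size = sizeof(long);             // size of one private copy
  in.reduce_init = (void *)example_red_init; // zero-initialize a private copy
  in.reduce_fini = NULL;                     // no finalizer needed
  in.reduce_comb = (void *)example_red_comb; // combine private into shared
  in.flags.lazy_priv = 0;                    // eager per-thread allocation
  in.flags.reserved31 = 0;
  // returns the taskgroup pointer used later by the get_th_data call
  return __kmpc_task_reduction_init(gtid, 1, &in);
}
#endif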
1944 
1954 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
1955  kmp_info_t *thread = __kmp_threads[gtid];
1956  kmp_int32 nth = thread->th.th_team_nproc;
1957  if (nth == 1)
1958  return data; // nothing to do
1959 
1960  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
1961  if (tg == NULL)
1962  tg = thread->th.th_current_task->td_taskgroup;
1963  KMP_ASSERT(tg != NULL);
1964  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
1965  kmp_int32 num = tg->reduce_num_data;
1966  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1967 
1968  KMP_ASSERT(data != NULL);
1969  while (tg != NULL) {
1970  for (int i = 0; i < num; ++i) {
1971  if (!arr[i].flags.lazy_priv) {
1972  if (data == arr[i].reduce_shar ||
1973  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
1974  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
1975  } else {
1976  // check shared location first
1977  void **p_priv = (void **)(arr[i].reduce_priv);
1978  if (data == arr[i].reduce_shar)
1979  goto found;
1980  // check if we were given a thread-specific location as the parameter
1981  for (int j = 0; j < nth; ++j)
1982  if (data == p_priv[j])
1983  goto found;
1984  continue; // not found, continue search
1985  found:
1986  if (p_priv[tid] == NULL) {
1987  // allocate thread specific object lazily
1988  void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
1989  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
1990  if (f_init != NULL) {
1991  f_init(p_priv[tid]);
1992  }
1993  }
1994  return p_priv[tid];
1995  }
1996  }
1997  tg = tg->parent; // NULL once the outermost taskgroup has been checked
1998  arr = (tg != NULL) ? (kmp_task_red_data_t *)(tg->reduce_data) : NULL;
1999  num = (tg != NULL) ? tg->reduce_num_data : 0;
2000  }
2001  KMP_ASSERT2(0, "Unknown task reduction item");
2002  return NULL; // ERROR, this line never executed
2003 }
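
// Inside a participating task, the thread-private copy for a reduction item is
// fetched with __kmpc_task_reduction_get_th_data(). A minimal sketch (not part
// of the runtime; tg is assumed to be the pointer returned by
// __kmpc_task_reduction_init() above, shared_sum the registered shared item):
#if 0 // illustrative sketch only, never compiled
static void example_task_body(int gtid, void *tg, long *shared_sum, long v) {
  long *priv =
      (long *)__kmpc_task_reduction_get_th_data(gtid, tg, shared_sum);
  *priv += v; // update the private copy; copies are combined at taskgroup end
}
#endif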
2004 
2005 // Finalize task reduction.
2006 // Called from __kmpc_end_taskgroup()
2007 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2008  kmp_int32 nth = th->th.th_team_nproc;
2009  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2010  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
2011  kmp_int32 num = tg->reduce_num_data;
2012  for (int i = 0; i < num; ++i) {
2013  void *sh_data = arr[i].reduce_shar;
2014  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2015  void (*f_comb)(void *, void *) =
2016  (void (*)(void *, void *))(arr[i].reduce_comb);
2017  if (!arr[i].flags.lazy_priv) {
2018  void *pr_data = arr[i].reduce_priv;
2019  size_t size = arr[i].reduce_size;
2020  for (int j = 0; j < nth; ++j) {
2021  void *priv_data = (char *)pr_data + j * size;
2022  f_comb(sh_data, priv_data); // combine results
2023  if (f_fini)
2024  f_fini(priv_data); // finalize if needed
2025  }
2026  } else {
2027  void **pr_data = (void **)(arr[i].reduce_priv);
2028  for (int j = 0; j < nth; ++j) {
2029  if (pr_data[j] != NULL) {
2030  f_comb(sh_data, pr_data[j]); // combine results
2031  if (f_fini)
2032  f_fini(pr_data[j]); // finalize if needed
2033  __kmp_free(pr_data[j]);
2034  }
2035  }
2036  }
2037  __kmp_free(arr[i].reduce_priv);
2038  }
2039  __kmp_thread_free(th, arr);
2040  tg->reduce_data = NULL;
2041  tg->reduce_num_data = 0;
2042 }
2043 #endif
2044 
2045 #if OMP_40_ENABLED
2046 // __kmpc_taskgroup: Start a new taskgroup
2047 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2048  kmp_info_t *thread = __kmp_threads[gtid];
2049  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2050  kmp_taskgroup_t *tg_new =
2051  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2052  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2053  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2054  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2055  tg_new->parent = taskdata->td_taskgroup;
2056 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
2057 #if OMP_45_ENABLED
2058  tg_new->reduce_data = NULL;
2059  tg_new->reduce_num_data = 0;
2060 #endif
2061  taskdata->td_taskgroup = tg_new;
2062 
2063 #if OMPT_SUPPORT && OMPT_OPTIONAL
2064  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2065  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2066  if (!codeptr)
2067  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2068  kmp_team_t *team = thread->th.th_team;
2069  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2070  // FIXME: I think this is wrong for lwt!
2071  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2072 
2073  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2074  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2075  &(my_task_data), codeptr);
2076  }
2077 #endif
2078 }
2079 
2080 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2081 // and its descendants are complete
2082 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2083  kmp_info_t *thread = __kmp_threads[gtid];
2084  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2085  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2086  int thread_finished = FALSE;
2087 
2088 #if OMPT_SUPPORT && OMPT_OPTIONAL
2089  kmp_team_t *team;
2090  ompt_data_t my_task_data;
2091  ompt_data_t my_parallel_data;
2092  void *codeptr;
2093  if (UNLIKELY(ompt_enabled.enabled)) {
2094  team = thread->th.th_team;
2095  my_task_data = taskdata->ompt_task_info.task_data;
2096  // FIXME: I think this is wrong for lwt!
2097  my_parallel_data = team->t.ompt_team_info.parallel_data;
2098  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2099  if (!codeptr)
2100  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2101  }
2102 #endif
2103 
2104  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2105  KMP_DEBUG_ASSERT(taskgroup != NULL);
2106  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2107 
2108  if (__kmp_tasking_mode != tskm_immediate_exec) {
2109  // mark the task as waiting (not at a barrier)
2110  taskdata->td_taskwait_counter += 1;
2111  taskdata->td_taskwait_ident = loc;
2112  taskdata->td_taskwait_thread = gtid + 1;
2113 #if USE_ITT_BUILD
2114  // For ITT the taskgroup wait is similar to taskwait until we need to
2115  // distinguish them
2116  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2117  if (itt_sync_obj != NULL)
2118  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2119 #endif /* USE_ITT_BUILD */
2120 
2121 #if OMPT_SUPPORT && OMPT_OPTIONAL
2122  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2123  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2124  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2125  &(my_task_data), codeptr);
2126  }
2127 #endif
2128 
2129 #if OMP_45_ENABLED
2130  if (!taskdata->td_flags.team_serial ||
2131  (thread->th.th_task_team != NULL &&
2132  thread->th.th_task_team->tt.tt_found_proxy_tasks))
2133 #else
2134  if (!taskdata->td_flags.team_serial)
2135 #endif
2136  {
2137  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2138  0U);
2139  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2140  flag.execute_tasks(thread, gtid, FALSE,
2141  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2142  __kmp_task_stealing_constraint);
2143  }
2144  }
2145  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2146 
2147 #if OMPT_SUPPORT && OMPT_OPTIONAL
2148  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2149  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2150  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2151  &(my_task_data), codeptr);
2152  }
2153 #endif
2154 
2155 #if USE_ITT_BUILD
2156  if (itt_sync_obj != NULL)
2157  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2158 #endif /* USE_ITT_BUILD */
2159  }
2160  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2161 
2162 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
2163 #if OMP_45_ENABLED
2164  if (taskgroup->reduce_data != NULL) // need to reduce?
2165  __kmp_task_reduction_fini(thread, taskgroup);
2166 #endif
2167  // Restore parent taskgroup for the current task
2168  taskdata->td_taskgroup = taskgroup->parent;
2169  __kmp_thread_free(thread, taskgroup);
2170 
2171  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2172  gtid, taskdata));
2173  ANNOTATE_HAPPENS_AFTER(taskdata);
2174 
2175 #if OMPT_SUPPORT && OMPT_OPTIONAL
2176  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2177  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2178  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2179  &(my_task_data), codeptr);
2180  }
2181 #endif
2182 }
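
// __kmpc_taskgroup() and __kmpc_end_taskgroup() bracket a taskgroup region:
// the first pushes a new kmp_taskgroup_t, the second waits on its counter and
// runs any pending task reductions. A minimal sketch of the pairing a compiler
// emits for "#pragma omp taskgroup" (not part of the runtime; the task
// creation inside the region is elided):
#if 0 // illustrative sketch only, never compiled
void example_taskgroup_region(ident_t *loc, int gtid) {
  __kmpc_taskgroup(loc, gtid);
  // ... create child tasks here, e.g. via __kmpc_omp_task() ...
  __kmpc_end_taskgroup(loc, gtid); // waits for all descendants of the group
}
#endif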
2183 #endif
2184 
2185 // __kmp_remove_my_task: remove a task from my own deque
2186 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2187  kmp_task_team_t *task_team,
2188  kmp_int32 is_constrained) {
2189  kmp_task_t *task;
2190  kmp_taskdata_t *taskdata;
2191  kmp_thread_data_t *thread_data;
2192  kmp_uint32 tail;
2193 
2194  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2195  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2196  NULL); // Caller should check this condition
2197 
2198  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2199 
2200  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2201  gtid, thread_data->td.td_deque_ntasks,
2202  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2203 
2204  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2205  KA_TRACE(10,
2206  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2207  "ntasks=%d head=%u tail=%u\n",
2208  gtid, thread_data->td.td_deque_ntasks,
2209  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2210  return NULL;
2211  }
2212 
2213  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2214 
2215  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2216  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2217  KA_TRACE(10,
2218  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2219  "ntasks=%d head=%u tail=%u\n",
2220  gtid, thread_data->td.td_deque_ntasks,
2221  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2222  return NULL;
2223  }
2224 
2225  tail = (thread_data->td.td_deque_tail - 1) &
2226  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2227  taskdata = thread_data->td.td_deque[tail];
2228 
2229  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
2230  // we need to check if the candidate obeys the task scheduling constraint (TSC):
2231  // only a descendant of all deferred tied tasks can be scheduled; checking
2232  // the last one is enough, as it in turn is the descendant of all others
2233  kmp_taskdata_t *current = thread->th.th_current_task->td_last_tied;
2234  KMP_DEBUG_ASSERT(current != NULL);
2235  // check if last tied task is not suspended on barrier
2236  if (current->td_flags.tasktype == TASK_EXPLICIT ||
2237  current->td_taskwait_thread > 0) { // <= 0 on barrier
2238  kmp_int32 level = current->td_level;
2239  kmp_taskdata_t *parent = taskdata->td_parent;
2240  while (parent != current && parent->td_level > level) {
2241  parent = parent->td_parent; // check generation up to the level of the
2242  // current task
2243  KMP_DEBUG_ASSERT(parent != NULL);
2244  }
2245  if (parent != current) {
2246  // The TSC does not allow stealing the victim task
2247  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2248  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2249  "ntasks=%d head=%u tail=%u\n",
2250  gtid, thread_data->td.td_deque_ntasks,
2251  thread_data->td.td_deque_head,
2252  thread_data->td.td_deque_tail));
2253  return NULL;
2254  }
2255  }
2256  }
2257 
2258  thread_data->td.td_deque_tail = tail;
2259  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2260 
2261  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2262 
2263  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
2264  "ntasks=%d head=%u tail=%u\n",
2265  gtid, taskdata, thread_data->td.td_deque_ntasks,
2266  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2267 
2268  task = KMP_TASKDATA_TO_TASK(taskdata);
2269  return task;
2270 }
2271 
2272 // __kmp_steal_task: remove a task from another thread's deque
2273 // Assume that calling thread has already checked existence of
2274 // task_team thread_data before calling this routine.
2275 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2276  kmp_task_team_t *task_team,
2277  std::atomic<kmp_int32> *unfinished_threads,
2278  int *thread_finished,
2279  kmp_int32 is_constrained) {
2280  kmp_task_t *task;
2281  kmp_taskdata_t *taskdata;
2282  kmp_taskdata_t *current;
2283  kmp_thread_data_t *victim_td, *threads_data;
2284  kmp_int32 level, target;
2285  kmp_int32 victim_tid;
2286 
2287  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2288 
2289  threads_data = task_team->tt.tt_threads_data;
2290  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2291 
2292  victim_tid = victim_thr->th.th_info.ds.ds_tid;
2293  victim_td = &threads_data[victim_tid];
2294 
2295  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2296  "task_team=%p ntasks=%d head=%u tail=%u\n",
2297  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2298  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2299  victim_td->td.td_deque_tail));
2300 
2301  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2302  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2303  "task_team=%p ntasks=%d head=%u tail=%u\n",
2304  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2305  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2306  victim_td->td.td_deque_tail));
2307  return NULL;
2308  }
2309 
2310  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2311 
2312  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2313  // Check again after we acquire the lock
2314  if (ntasks == 0) {
2315  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2316  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2317  "task_team=%p ntasks=%d head=%u tail=%u\n",
2318  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2319  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2320  return NULL;
2321  }
2322 
2323  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2324 
2325  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2326  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
2327  // we need to check if the candidate obeys the task scheduling constraint (TSC):
2328  // only a descendant of all deferred tied tasks can be scheduled; checking
2329  // the last one is enough, as it in turn is the descendant of all others
2330  current = __kmp_threads[gtid]->th.th_current_task->td_last_tied;
2331  KMP_DEBUG_ASSERT(current != NULL);
2332  // check if last tied task is not suspended on barrier
2333  if (current->td_flags.tasktype == TASK_EXPLICIT ||
2334  current->td_taskwait_thread > 0) { // <= 0 on barrier
2335  level = current->td_level;
2336  kmp_taskdata_t *parent = taskdata->td_parent;
2337  while (parent != current && parent->td_level > level) {
2338  parent = parent->td_parent; // check generation up to the level of the
2339  // current task
2340  KMP_DEBUG_ASSERT(parent != NULL);
2341  }
2342  if (parent != current) {
2343  if (!task_team->tt.tt_untied_task_encountered) {
2344  // The TSC does not allow stealing the victim task
2345  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2346  KA_TRACE(10,
2347  ("__kmp_steal_task(exit #3): T#%d could not steal from "
2348  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2349  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2350  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2351  return NULL;
2352  }
2353  taskdata = NULL; // will check other tasks in victim's deque
2354  }
2355  }
2356  }
2357  if (taskdata != NULL) {
2358  // Bump head pointer and Wrap.
2359  victim_td->td.td_deque_head =
2360  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2361  } else {
2362  int i;
2363  // walk through victim's deque trying to steal any task
2364  target = victim_td->td.td_deque_head;
2365  for (i = 1; i < ntasks; ++i) {
2366  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2367  taskdata = victim_td->td.td_deque[target];
2368  if (taskdata->td_flags.tiedness == TASK_TIED) {
2369  // check if the candidate obeys the TSC
2370  kmp_taskdata_t *parent = taskdata->td_parent;
2371  // check generation up to the level of the current task
2372  while (parent != current && parent->td_level > level) {
2373  parent = parent->td_parent;
2374  KMP_DEBUG_ASSERT(parent != NULL);
2375  }
2376  if (parent != current) {
2377  // The TSC does not allow stealing the candidate
2378  taskdata = NULL;
2379  continue;
2380  } else {
2381  // found victim tied task
2382  break;
2383  }
2384  } else {
2385  // found victim untied task
2386  break;
2387  }
2388  }
2389  if (taskdata == NULL) {
2390  // No appropriate candidate to steal found
2391  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2392  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2393  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2394  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2395  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2396  return NULL;
2397  }
2398  int prev = target;
2399  for (i = i + 1; i < ntasks; ++i) {
2400  // shift remaining tasks in the deque left by 1
2401  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2402  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2403  prev = target;
2404  }
2405  KMP_DEBUG_ASSERT(victim_td->td.td_deque_tail ==
2406  ((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2407  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2408  }
2409  if (*thread_finished) {
2410  // We need to un-mark this victim as a finished victim. This must be done
2411  // before releasing the lock, or else other threads (starting with the
2412  // master victim) might be prematurely released from the barrier!!!
2413  kmp_int32 count;
2414 
2415  count = KMP_ATOMIC_INC(unfinished_threads);
2416 
2417  KA_TRACE(
2418  20,
2419  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2420  gtid, count + 1, task_team));
2421 
2422  *thread_finished = FALSE;
2423  }
2424  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2425 
2426  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2427 
2428  KMP_COUNT_BLOCK(TASK_stolen);
2429  KA_TRACE(10,
2430  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2431  "task_team=%p ntasks=%d head=%u tail=%u\n",
2432  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2433  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2434 
2435  task = KMP_TASKDATA_TO_TASK(taskdata);
2436  return task;
2437 }
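
// Both __kmp_remove_my_task() and __kmp_steal_task() enforce the task
// scheduling constraint (TSC) the same way: walk the candidate's td_parent
// chain while the ancestor's td_level is greater than the level of the current
// thread's last deferred tied task; the candidate may be scheduled only if the
// walk lands on that tied task. The standalone sketch below (not part of the
// runtime; the node type is a hypothetical stand-in for kmp_taskdata_t) spells
// out just that check.
#if 0 // illustrative sketch only, never compiled
struct example_node {
  example_node *parent; // analogous to td_parent
  int level;            // analogous to td_level
};

// true iff 'candidate' is a descendant of 'last_tied'; assumes the parent
// chain eventually reaches a node whose level is not greater than last_tied's
static bool example_tsc_allows(example_node *candidate,
                               example_node *last_tied) {
  example_node *p = candidate->parent;
  while (p != last_tied && p->level > last_tied->level)
    p = p->parent; // climb toward the root, never past last_tied's level
  return p == last_tied;
}
#endif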
2438 
2439 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2440 // condition is satisfied (return true) or there are none left (return false).
2441 //
2442 // final_spin is TRUE if this is the spin at the release barrier.
2443 // thread_finished indicates whether the thread is finished executing all
2444 // the tasks it has on its deque, and is at the release barrier.
2445 // spinner is the location on which to spin.
2446 // spinner == NULL means only execute a single task and return.
2447 // checker is the value to check to terminate the spin.
2448 template <class C>
2449 static inline int __kmp_execute_tasks_template(
2450  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2451  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2452  kmp_int32 is_constrained) {
2453  kmp_task_team_t *task_team = thread->th.th_task_team;
2454  kmp_thread_data_t *threads_data;
2455  kmp_task_t *task;
2456  kmp_info_t *other_thread;
2457  kmp_taskdata_t *current_task = thread->th.th_current_task;
2458  std::atomic<kmp_int32> *unfinished_threads;
2459  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2460  tid = thread->th.th_info.ds.ds_tid;
2461 
2462  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2463  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2464 
2465  if (task_team == NULL)
2466  return FALSE;
2467 
2468  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2469  "*thread_finished=%d\n",
2470  gtid, final_spin, *thread_finished));
2471 
2472  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2473  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2474  KMP_DEBUG_ASSERT(threads_data != NULL);
2475 
2476  nthreads = task_team->tt.tt_nproc;
2477  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2478 #if OMP_45_ENABLED
2479  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2480 #else
2481  KMP_DEBUG_ASSERT(nthreads > 1);
2482 #endif
2483  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2484 
2485  while (1) { // Outer loop keeps trying to find tasks in case of single thread
2486  // getting tasks from target constructs
2487  while (1) { // Inner loop to find a task and execute it
2488  task = NULL;
2489  if (use_own_tasks) { // check on own queue first
2490  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2491  }
2492  if ((task == NULL) && (nthreads > 1)) { // Steal a task
2493  int asleep = 1;
2494  use_own_tasks = 0;
2495  // Try to steal from the last place I stole from successfully.
2496  if (victim_tid == -2) { // haven't stolen anything yet
2497  victim_tid = threads_data[tid].td.td_deque_last_stolen;
2498  if (victim_tid !=
2499  -1) // if we have a last stolen from victim, get the thread
2500  other_thread = threads_data[victim_tid].td.td_thr;
2501  }
2502  if (victim_tid != -1) { // found last victim
2503  asleep = 0;
2504  } else if (!new_victim) { // no recent steals and we haven't already
2505  // used a new victim; select a random thread
2506  do { // Find a different thread to steal work from.
2507  // Pick a random thread. Initial plan was to cycle through all the
2508  // threads, and only return if we tried to steal from every thread,
2509  // and failed. Arch says that's not such a great idea.
2510  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2511  if (victim_tid >= tid) {
2512  ++victim_tid; // Adjusts random distribution to exclude self
2513  }
2514  // Found a potential victim
2515  other_thread = threads_data[victim_tid].td.td_thr;
2516  // There is a slight chance that __kmp_enable_tasking() did not wake
2517  // up all threads waiting at the barrier. If victim is sleeping,
2518  // then wake it up. Since we were going to pay the cache miss
2519  // penalty for referencing another thread's kmp_info_t struct
2520  // anyway,
2521  // the check shouldn't cost too much performance at this point. In
2522  // extra barrier mode, tasks do not sleep at the separate tasking
2523  // barrier, so this isn't a problem.
2524  asleep = 0;
2525  if ((__kmp_tasking_mode == tskm_task_teams) &&
2526  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2527  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2528  NULL)) {
2529  asleep = 1;
2530  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2531  other_thread->th.th_sleep_loc);
2532  // A sleeping thread should not have any tasks on its queue.
2533  // There is a slight possibility that it resumes, steals a task
2534  // from another thread, which spawns more tasks, all in the time
2535  // that it takes this thread to check => don't write an assertion
2536  // that the victim's queue is empty. Try stealing from a
2537  // different thread.
2538  }
2539  } while (asleep);
2540  }
2541 
2542  if (!asleep) {
2543  // We have a victim to try to steal from
2544  task = __kmp_steal_task(other_thread, gtid, task_team,
2545  unfinished_threads, thread_finished,
2546  is_constrained);
2547  }
2548  if (task != NULL) { // set last stolen to victim
2549  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2550  threads_data[tid].td.td_deque_last_stolen = victim_tid;
2551  // The pre-refactored code did not try more than 1 successful new
2552  // victim, unless the last one generated more local tasks;
2553  // new_victim keeps track of this
2554  new_victim = 1;
2555  }
2556  } else { // No tasks found; unset last_stolen
2557  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2558  victim_tid = -2; // no successful victim found
2559  }
2560  }
2561 
2562  if (task == NULL) // break out of tasking loop
2563  break;
2564 
2565 // Found a task; execute it
2566 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2567  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2568  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2569  // get the object reliably
2570  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2571  }
2572  __kmp_itt_task_starting(itt_sync_obj);
2573  }
2574 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2575  __kmp_invoke_task(gtid, task, current_task);
2576 #if USE_ITT_BUILD
2577  if (itt_sync_obj != NULL)
2578  __kmp_itt_task_finished(itt_sync_obj);
2579 #endif /* USE_ITT_BUILD */
2580  // If this thread is only partway through the barrier and the condition is
2581  // met, then return now, so that the barrier gather/release pattern can
2582  // proceed. If this thread is in the last spin loop in the barrier,
2583  // waiting to be released, we know that the termination condition will not
2584  // be satisfied, so don't waste any cycles checking it.
2585  if (flag == NULL || (!final_spin && flag->done_check())) {
2586  KA_TRACE(
2587  15,
2588  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2589  gtid));
2590  return TRUE;
2591  }
2592  if (thread->th.th_task_team == NULL) {
2593  break;
2594  }
2595  // Yield before executing next task
2596  KMP_YIELD(__kmp_library == library_throughput);
2597  // If execution of a stolen task results in more tasks being placed on our
2598  // run queue, reset use_own_tasks
2599  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2600  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2601  "other tasks, restart\n",
2602  gtid));
2603  use_own_tasks = 1;
2604  new_victim = 0;
2605  }
2606  }
2607 
2608 // The task source has been exhausted. If in final spin loop of barrier, check
2609 // if termination condition is satisfied.
2610 #if OMP_45_ENABLED
2611  // The work queue may be empty but there might be proxy tasks still
2612  // executing
2613  if (final_spin &&
2614  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
2615 #else
2616  if (final_spin)
2617 #endif
2618  {
2619  // First, decrement the #unfinished threads, if that has not already been
2620  // done. This decrement might be to the spin location, and result in the
2621  // termination condition being satisfied.
2622  if (!*thread_finished) {
2623  kmp_int32 count;
2624 
2625  count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
2626  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2627  "unfinished_threads to %d task_team=%p\n",
2628  gtid, count, task_team));
2629  *thread_finished = TRUE;
2630  }
2631 
2632  // It is now unsafe to reference thread->th.th_team !!!
2633  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2634  // thread to pass through the barrier, where it might reset each thread's
2635  // th.th_team field for the next parallel region. If we can steal more
2636  // work, we know that this has not happened yet.
2637  if (flag != NULL && flag->done_check()) {
2638  KA_TRACE(
2639  15,
2640  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2641  gtid));
2642  return TRUE;
2643  }
2644  }
2645 
2646  // If this thread's task team is NULL, master has recognized that there are
2647  // no more tasks; bail out
2648  if (thread->th.th_task_team == NULL) {
2649  KA_TRACE(15,
2650  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2651  return FALSE;
2652  }
2653 
2654 #if OMP_45_ENABLED
2655  // We could be getting tasks from target constructs; if this is the only
2656  // thread, keep trying to execute tasks from own queue
2657  if (nthreads == 1)
2658  use_own_tasks = 1;
2659  else
2660 #endif
2661  {
2662  KA_TRACE(15,
2663  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2664  return FALSE;
2665  }
2666  }
2667 }
2668 
2669 int __kmp_execute_tasks_32(
2670  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2671  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2672  kmp_int32 is_constrained) {
2673  return __kmp_execute_tasks_template(
2674  thread, gtid, flag, final_spin,
2675  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2676 }
2677 
2678 int __kmp_execute_tasks_64(
2679  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2680  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2681  kmp_int32 is_constrained) {
2682  return __kmp_execute_tasks_template(
2683  thread, gtid, flag, final_spin,
2684  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2685 }
2686 
2687 int __kmp_execute_tasks_oncore(
2688  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2689  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2690  kmp_int32 is_constrained) {
2691  return __kmp_execute_tasks_template(
2692  thread, gtid, flag, final_spin,
2693  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2694 }
2695 
2696 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2697 // next barrier so they can assist in executing enqueued tasks.
2698 // First thread in allocates the task team atomically.
2699 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
2700  kmp_info_t *this_thr) {
2701  kmp_thread_data_t *threads_data;
2702  int nthreads, i, is_init_thread;
2703 
2704  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
2705  __kmp_gtid_from_thread(this_thr)));
2706 
2707  KMP_DEBUG_ASSERT(task_team != NULL);
2708  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2709 
2710  nthreads = task_team->tt.tt_nproc;
2711  KMP_DEBUG_ASSERT(nthreads > 0);
2712  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2713 
2714  // Allocate or increase the size of threads_data if necessary
2715  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
2716 
2717  if (!is_init_thread) {
2718  // Some other thread already set up the array.
2719  KA_TRACE(
2720  20,
2721  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2722  __kmp_gtid_from_thread(this_thr)));
2723  return;
2724  }
2725  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2726  KMP_DEBUG_ASSERT(threads_data != NULL);
2727 
2728  if ((__kmp_tasking_mode == tskm_task_teams) &&
2729  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
2730  // Release any threads sleeping at the barrier, so that they can steal
2731  // tasks and execute them. In extra barrier mode, tasks do not sleep
2732  // at the separate tasking barrier, so this isn't a problem.
2733  for (i = 0; i < nthreads; i++) {
2734  volatile void *sleep_loc;
2735  kmp_info_t *thread = threads_data[i].td.td_thr;
2736 
2737  if (i == this_thr->th.th_info.ds.ds_tid) {
2738  continue;
2739  }
2740  // Since we haven't locked the thread's suspend mutex lock at this
2741  // point, there is a small window where a thread might be putting
2742  // itself to sleep, but hasn't set the th_sleep_loc field yet.
2743  // To work around this, __kmp_execute_tasks_template() periodically checks to
2744  // see if other threads are sleeping (using the same random mechanism that
2745  // is used for task stealing) and awakens them if they are.
2746  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2747  NULL) {
2748  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2749  __kmp_gtid_from_thread(this_thr),
2750  __kmp_gtid_from_thread(thread)));
2751  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2752  } else {
2753  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2754  __kmp_gtid_from_thread(this_thr),
2755  __kmp_gtid_from_thread(thread)));
2756  }
2757  }
2758  }
2759 
2760  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
2761  __kmp_gtid_from_thread(this_thr)));
2762 }
2763 
2764 /* // TODO: Check the comment consistency
2765  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind
2766  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
2767  * After a child thread checks into a barrier and calls __kmp_release() from
2768  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2769  * longer assume that the kmp_team_t structure is intact (at any moment, the
2770  * master thread may exit the barrier code and free the team data structure,
2771  * and return the threads to the thread pool).
2772  *
2773  * This does not work with the tasking code, as the thread is still
2774  * expected to participate in the execution of any tasks that may have been
2775  * spawned by a member of the team, and the thread still needs access to
2776  * each thread in the team, so that it can steal work from them.
2777  *
2778  * Enter the existence of the kmp_task_team_t struct. It employs a reference
2779  * counting mechanism, and is allocated by the master thread before calling
2780  * __kmp_<barrier_kind>_release, and then is released by the last thread to
2781  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
2782  * of the kmp_task_team_t structs for consecutive barriers can overlap
2783  * (and will, unless the master thread is the last thread to exit the barrier
2784  * release phase, which is not typical).
2785  *
2786  * The existence of such a struct is useful outside the context of tasking,
2787  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2788  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2789  * libraries.
2790  *
2791  * We currently use the existence of the threads array as an indicator that
2792  * tasks were spawned since the last barrier. If the structure is to be
2793  * useful outside the context of tasking, then this will have to change, but
2794  * not setting the field minimizes the performance impact of tasking on
2795  * barriers, when no explicit tasks were spawned (pushed, actually).
2796  */
2797 
2798 static kmp_task_team_t *__kmp_free_task_teams =
2799  NULL; // Free list for task_team data structures
2800 // Lock for task team data structures
2801 kmp_bootstrap_lock_t __kmp_task_team_lock =
2802  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
2803 
2804 // __kmp_alloc_task_deque:
2805 // Allocates a task deque for a particular thread, and initializes the necessary
2806 // data structures relating to the deque. This only happens once per thread
2807 // per task team since task teams are recycled. No lock is needed during
2808 // allocation since each thread allocates its own deque.
2809 static void __kmp_alloc_task_deque(kmp_info_t *thread,
2810  kmp_thread_data_t *thread_data) {
2811  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
2812  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
2813 
2814  // Initialize last stolen task field to "none"
2815  thread_data->td.td_deque_last_stolen = -1;
2816 
2817  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
2818  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
2819  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
2820 
2821  KE_TRACE(
2822  10,
2823  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2824  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
2825  // Allocate space for task deque, and zero the deque
2826  // Cannot use __kmp_thread_calloc() because threads not around for
2827  // kmp_reap_task_team( ).
2828  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
2829  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2830  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2831 }
2832 
2833 // __kmp_realloc_task_deque:
2834 // Re-allocates a task deque for a particular thread, copies the content from
2835 // the old deque and adjusts the necessary data structures relating to the
2836 // deque. This operation must be done with the deque_lock held
2837 static void __kmp_realloc_task_deque(kmp_info_t *thread,
2838  kmp_thread_data_t *thread_data) {
2839  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2840  kmp_int32 new_size = 2 * size;
2841 
2842  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
2843  "%d] for thread_data %p\n",
2844  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
2845 
2846  kmp_taskdata_t **new_deque =
2847  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
2848 
2849  int i, j;
2850  for (i = thread_data->td.td_deque_head, j = 0; j < size;
2851  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
2852  new_deque[j] = thread_data->td.td_deque[i];
2853 
2854  __kmp_free(thread_data->td.td_deque);
2855 
2856  thread_data->td.td_deque_head = 0;
2857  thread_data->td.td_deque_tail = size;
2858  thread_data->td.td_deque = new_deque;
2859  thread_data->td.td_deque_size = new_size;
2860 }
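
// The per-thread deque is a power-of-two ring buffer: TASK_DEQUE_MASK(td) is
// size-1, so head/tail indices wrap with a bitwise AND, and doubling the deque
// copies the live entries in order starting from the old head, leaving head=0
// and tail=old size in the new storage. A minimal sketch of the index
// arithmetic (not part of the runtime; the size of 4 is an arbitrary example):
#if 0 // illustrative sketch only, never compiled
static void example_ring_indexing() {
  const kmp_uint32 size = 4;        // must be a power of two
  const kmp_uint32 mask = size - 1; // what TASK_DEQUE_MASK() evaluates to
  kmp_uint32 tail = 3;
  tail = (tail + 1) & mask;         // push at tail: 3 -> 0, wrapping around
  kmp_uint32 head = 0;
  head = (head + 1) & mask;         // steal from head: 0 -> 1
  (void)tail;
  (void)head;
}
#endif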
2861 
2862 // __kmp_free_task_deque:
2863 // Deallocates a task deque for a particular thread. Happens at library
2864 // deallocation, so we don't need to reset all thread data fields.
2865 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
2866  if (thread_data->td.td_deque != NULL) {
2867  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2868  TCW_4(thread_data->td.td_deque_ntasks, 0);
2869  __kmp_free(thread_data->td.td_deque);
2870  thread_data->td.td_deque = NULL;
2871  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2872  }
2873 
2874 #ifdef BUILD_TIED_TASK_STACK
2875  // GEH: Figure out what to do here for td_susp_tied_tasks
2876  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
2877  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
2878  }
2879 #endif // BUILD_TIED_TASK_STACK
2880 }
2881 
2882 // __kmp_realloc_task_threads_data:
2883 // Allocates a threads_data array for a task team, either by allocating an
2884 // initial array or enlarging an existing array. Only the first thread to get
2885 // the lock allocs or enlarges the array and re-initializes the array elements.
2886 // That thread returns "TRUE", the rest return "FALSE".
2887 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2888 // The current size is given by task_team -> tt.tt_max_threads.
2889 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
2890  kmp_task_team_t *task_team) {
2891  kmp_thread_data_t **threads_data_p;
2892  kmp_int32 nthreads, maxthreads;
2893  int is_init_thread = FALSE;
2894 
2895  if (TCR_4(task_team->tt.tt_found_tasks)) {
2896  // Already reallocated and initialized.
2897  return FALSE;
2898  }
2899 
2900  threads_data_p = &task_team->tt.tt_threads_data;
2901  nthreads = task_team->tt.tt_nproc;
2902  maxthreads = task_team->tt.tt_max_threads;
2903 
2904  // All threads must lock when they encounter the first task of the implicit
2905 // task region to make sure the threads_data fields are (re)initialized
2906 // before they are used.
2907  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2908 
2909  if (!TCR_4(task_team->tt.tt_found_tasks)) {
2910  // first thread to enable tasking
2911  kmp_team_t *team = thread->th.th_team;
2912  int i;
2913 
2914  is_init_thread = TRUE;
2915  if (maxthreads < nthreads) {
2916 
2917  if (*threads_data_p != NULL) {
2918  kmp_thread_data_t *old_data = *threads_data_p;
2919  kmp_thread_data_t *new_data = NULL;
2920 
2921  KE_TRACE(
2922  10,
2923  ("__kmp_realloc_task_threads_data: T#%d reallocating "
2924  "threads data for task_team %p, new_size = %d, old_size = %d\n",
2925  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
2926  // Reallocate threads_data to have more elements than current array
2927  // Cannot use __kmp_thread_realloc() because threads not around for
2928  // kmp_reap_task_team( ). Note all new array entries are initialized
2929  // to zero by __kmp_allocate().
2930  new_data = (kmp_thread_data_t *)__kmp_allocate(
2931  nthreads * sizeof(kmp_thread_data_t));
2932  // copy old data to new data
2933  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
2934  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
2935 
2936 #ifdef BUILD_TIED_TASK_STACK
2937  // GEH: Figure out if this is the right thing to do
2938  for (i = maxthreads; i < nthreads; i++) {
2939  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2940  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2941  }
2942 #endif // BUILD_TIED_TASK_STACK
2943  // Install the new data and free the old data
2944  (*threads_data_p) = new_data;
2945  __kmp_free(old_data);
2946  } else {
2947  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
2948  "threads data for task_team %p, size = %d\n",
2949  __kmp_gtid_from_thread(thread), task_team, nthreads));
2950  // Make the initial allocate for threads_data array, and zero entries
2951  // Cannot use __kmp_thread_calloc() because threads not around for
2952  // kmp_reap_task_team( ).
2953  ANNOTATE_IGNORE_WRITES_BEGIN();
2954  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
2955  nthreads * sizeof(kmp_thread_data_t));
2956  ANNOTATE_IGNORE_WRITES_END();
2957 #ifdef BUILD_TIED_TASK_STACK
2958  // GEH: Figure out if this is the right thing to do
2959  for (i = 0; i < nthreads; i++) {
2960  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2961  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2962  }
2963 #endif // BUILD_TIED_TASK_STACK
2964  }
2965  task_team->tt.tt_max_threads = nthreads;
2966  } else {
2967  // If array has (more than) enough elements, go ahead and use it
2968  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
2969  }
2970 
2971  // initialize threads_data pointers back to thread_info structures
2972  for (i = 0; i < nthreads; i++) {
2973  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2974  thread_data->td.td_thr = team->t.t_threads[i];
2975 
2976  if (thread_data->td.td_deque_last_stolen >= nthreads) {
2977  // The last stolen field survives across teams / barrier, and the number
2978  // of threads may have changed. It's possible (likely?) that a new
2979 // parallel region will exhibit the same behavior as the previous region.
2980  thread_data->td.td_deque_last_stolen = -1;
2981  }
2982  }
2983 
2984  KMP_MB();
2985  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
2986  }
2987 
2988  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2989  return is_init_thread;
2990 }
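// The routine above is an instance of the check / lock / re-check idiom:
// every thread tests tt_found_tasks without the lock, threads that see it
// unset take tt_threads_lock, and only the first of them (re)allocates the
// array and publishes the fact with a release store. A reduced sketch of the
// same pattern (illustrative only; init_once, done, and lck are assumed
// stand-ins for this function, tt_found_tasks, and tt_threads_lock, with
// <atomic> and <mutex> assumed available):
//
//   int init_once(std::atomic<int> *done, std::mutex *lck) {
//     if (done->load(std::memory_order_acquire))
//       return 0;                                // fast path, already set up
//     std::lock_guard<std::mutex> guard(*lck);
//     if (done->load(std::memory_order_relaxed))
//       return 0;                                // another thread won the race
//     /* ... allocate or enlarge the shared array here ... */
//     done->store(1, std::memory_order_release); // publish the initialization
//     return 1;                                  // this thread initialized it
//   }
//
// In the code above, KMP_MB() followed by TCW_SYNC_4(tt_found_tasks, TRUE)
// provides the equivalent release ordering.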
2991 
2992 // __kmp_free_task_threads_data:
2993 // Deallocates a threads_data array for a task team, including any attached
2994 // tasking deques. Only occurs at library shutdown.
2995 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
2996  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2997  if (task_team->tt.tt_threads_data != NULL) {
2998  int i;
2999  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3000  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3001  }
3002  __kmp_free(task_team->tt.tt_threads_data);
3003  task_team->tt.tt_threads_data = NULL;
3004  }
3005  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3006 }
3007 
3008 // __kmp_allocate_task_team:
3009 // Allocates a task team associated with a specific team, taking it from
3010 // the global task team free list if possible. Also initializes data
3011 // structures.
3012 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3013  kmp_team_t *team) {
3014  kmp_task_team_t *task_team = NULL;
3015  int nthreads;
3016 
3017  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3018  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3019 
3020  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3021  // Take a task team from the task team pool
3022  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3023  if (__kmp_free_task_teams != NULL) {
3024  task_team = __kmp_free_task_teams;
3025  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3026  task_team->tt.tt_next = NULL;
3027  }
3028  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3029  }
3030 
3031  if (task_team == NULL) {
3032  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3033  "task team for team %p\n",
3034  __kmp_gtid_from_thread(thread), team));
3035  // Allocate a new task team if one is not available.
3036  // Cannot use __kmp_thread_malloc() because threads not around for
3037  // kmp_reap_task_team( ).
3038  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3039  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3040  // AC: __kmp_allocate zeroes returned memory
3041  // task_team -> tt.tt_threads_data = NULL;
3042  // task_team -> tt.tt_max_threads = 0;
3043  // task_team -> tt.tt_next = NULL;
3044  }
3045 
3046  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3047 #if OMP_45_ENABLED
3048  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3049 #endif
3050  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3051 
3052  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3053  TCW_4(task_team->tt.tt_active, TRUE);
3054 
3055  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3056  "unfinished_threads init'd to %d\n",
3057  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3058  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3059  return task_team;
3060 }
3061 
3062 // __kmp_free_task_team:
3063 // Frees the task team associated with a specific thread, and adds it
3064 // to the global task team free list.
3065 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3066  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3067  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3068 
3069  // Put task team back on free list
3070  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3071 
3072  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3073  task_team->tt.tt_next = __kmp_free_task_teams;
3074  TCW_PTR(__kmp_free_task_teams, task_team);
3075 
3076  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3077 }
3078 
3079 // __kmp_reap_task_teams:
3080 // Free all the task teams on the task team free list.
3081 // Should only be done during library shutdown.
3082 // Cannot do anything that needs a thread structure or gtid since they are
3083 // already gone.
3084 void __kmp_reap_task_teams(void) {
3085  kmp_task_team_t *task_team;
3086 
3087  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3088  // Free all task_teams on the free list
3089  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3090  while ((task_team = __kmp_free_task_teams) != NULL) {
3091  __kmp_free_task_teams = task_team->tt.tt_next;
3092  task_team->tt.tt_next = NULL;
3093 
3094  // Free threads_data if necessary
3095  if (task_team->tt.tt_threads_data != NULL) {
3096  __kmp_free_task_threads_data(task_team);
3097  }
3098  __kmp_free(task_team);
3099  }
3100  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3101  }
3102 }
3103 
3104 // __kmp_wait_to_unref_task_teams:
3105 // Some threads could still be in the fork barrier release code, possibly
3106 // trying to steal tasks. Wait for each thread to unreference its task team.
3107 void __kmp_wait_to_unref_task_teams(void) {
3108  kmp_info_t *thread;
3109  kmp_uint32 spins;
3110  int done;
3111 
3112  KMP_INIT_YIELD(spins);
3113 
3114  for (;;) {
3115  done = TRUE;
3116 
3117 // TODO: GEH - this may be wrong because some sync would be necessary
3118  // in case threads are added to the pool during the traversal. Need to
3119  // verify that lock for thread pool is held when calling this routine.
3120  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3121  thread = thread->th.th_next_pool) {
3122 #if KMP_OS_WINDOWS
3123  DWORD exit_val;
3124 #endif
3125  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3126  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3127  __kmp_gtid_from_thread(thread)));
3128  continue;
3129  }
3130 #if KMP_OS_WINDOWS
3131  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3132  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3133  thread->th.th_task_team = NULL;
3134  continue;
3135  }
3136 #endif
3137 
3138  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3139 
3140  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3141  "unreference task_team\n",
3142  __kmp_gtid_from_thread(thread)));
3143 
3144  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3145  volatile void *sleep_loc;
3146  // If the thread is sleeping, awaken it.
3147  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3148  NULL) {
3149  KA_TRACE(
3150  10,
3151  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3152  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3153  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3154  }
3155  }
3156  }
3157  if (done) {
3158  break;
3159  }
3160 
3161  // If we are oversubscribed, or have waited a bit (and library mode is
3162  // throughput), yield. Pause is in the following code.
3163  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
3164  KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
3165  }
3166 }
3167 
3168 // __kmp_task_team_setup: Create a task_team for the current team, but use
3169 // an already created, unused one if it already exists.
3170 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3171  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3172 
3173  // If this task_team hasn't been created yet, allocate it. It will be used in
3174  // the region after the next.
3175  // If it exists, it is the current task team and shouldn't be touched yet as
3176  // it may still be in use.
3177  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3178  (always || team->t.t_nproc > 1)) {
3179  team->t.t_task_team[this_thr->th.th_task_state] =
3180  __kmp_allocate_task_team(this_thr, team);
3181  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3182  "for team %d at parity=%d\n",
3183  __kmp_gtid_from_thread(this_thr),
3184  team->t.t_task_team[this_thr->th.th_task_state],
3185  ((team != NULL) ? team->t.t_id : -1),
3186  this_thr->th.th_task_state));
3187  }
3188 
3189  // After threads exit the release, they will call sync, and then point to this
3190  // other task_team; make sure it is allocated and properly initialized. As
3191  // threads spin in the barrier release phase, they will continue to use the
3192  // previous task_team struct(above), until they receive the signal to stop
3193  // checking for tasks (they can't safely reference the kmp_team_t struct,
3194  // which could be reallocated by the master thread). No task teams are formed
3195  // for serialized teams.
3196  if (team->t.t_nproc > 1) {
3197  int other_team = 1 - this_thr->th.th_task_state;
3198  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3199  team->t.t_task_team[other_team] =
3200  __kmp_allocate_task_team(this_thr, team);
3201  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3202  "task_team %p for team %d at parity=%d\n",
3203  __kmp_gtid_from_thread(this_thr),
3204  team->t.t_task_team[other_team],
3205  ((team != NULL) ? team->t.t_id : -1), other_team));
3206  } else { // Leave the old task team struct in place for the upcoming region;
3207  // adjust as needed
3208  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3209  if (!task_team->tt.tt_active ||
3210  team->t.t_nproc != task_team->tt.tt_nproc) {
3211  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3212  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3213 #if OMP_45_ENABLED
3214  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3215 #endif
3216  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3217  team->t.t_nproc);
3218  TCW_4(task_team->tt.tt_active, TRUE);
3219  }
3220  // if team size has changed, the first thread to enable tasking will
3221  // realloc threads_data if necessary
3222  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3223  "%p for team %d at parity=%d\n",
3224  __kmp_gtid_from_thread(this_thr),
3225  team->t.t_task_team[other_team],
3226  ((team != NULL) ? team->t.t_id : -1), other_team));
3227  }
3228  }
3229 }
3230 
3231 // __kmp_task_team_sync: Propagation of task team data from team to threads
3232 // which happens just after the release phase of a team barrier. This may be
3233 // called by any thread, but only for teams with # threads > 1.
3234 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3235  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3236 
3237  // Toggle the th_task_state field, to switch which task_team this thread
3238  // refers to
3239  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3240  // It is now safe to propagate the task team pointer from the team struct to
3241  // the current thread.
3242  TCW_PTR(this_thr->th.th_task_team,
3243  team->t.t_task_team[this_thr->th.th_task_state]);
3244  KA_TRACE(20,
3245  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3246  "%p from Team #%d (parity=%d)\n",
3247  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3248  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
3249 }
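// Taken together, __kmp_task_team_setup and __kmp_task_team_sync implement a
// two-slot (parity) scheme: t_task_team[0] and t_task_team[1] alternate as
// the "current" and "next" task team, so threads still draining tasks from
// the old structure never race with the master preparing the new one. A
// sketch of the sequence for one thread (region numbering is illustrative):
//
//   region N   : th_task_state == 0, tasks go through t_task_team[0];
//                in setup the master also makes sure t_task_team[1] exists.
//   barrier N  : in the gather phase the master waits on and deactivates
//                t_task_team[0] (__kmp_task_team_wait, below); after the
//                release phase each thread toggles th_task_state to 1 and
//                points th_task_team at t_task_team[1] (this routine).
//   region N+1 : tasks go through t_task_team[1]; slot 0 is reset and reused
//                by a later __kmp_task_team_setup call.
//
// The toggle is simply "state = 1 - state", so the same two slots serve the
// team for its whole lifetime.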
3250 
3251 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
3252 // barrier gather phase. Only called by master thread if #threads in team > 1 or
3253 // if proxy tasks were created.
3254 //
3255 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3256 // by passing in 0 optionally as the last argument. When wait is zero, master
3257 // thread does not wait for unfinished_threads to reach 0.
3258 void __kmp_task_team_wait(
3259  kmp_info_t *this_thr,
3260  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3261  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3262 
3263  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3264  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3265 
3266  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3267  if (wait) {
3268  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3269  "(for unfinished_threads to reach 0) on task_team = %p\n",
3270  __kmp_gtid_from_thread(this_thr), task_team));
3271  // Worker threads may have dropped through to release phase, but could
3272  // still be executing tasks. Wait here for tasks to complete. To avoid
3273  // memory contention, only master thread checks termination condition.
3274  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3275  &task_team->tt.tt_unfinished_threads),
3276  0U);
3277  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3278  }
3279  // Deactivate the old task team, so that the worker threads will stop
3280  // referencing it while spinning.
3281  KA_TRACE(
3282  20,
3283  ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3284  "setting active to false, setting local and team's pointer to NULL\n",
3285  __kmp_gtid_from_thread(this_thr), task_team));
3286 #if OMP_45_ENABLED
3287  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3288  task_team->tt.tt_found_proxy_tasks == TRUE);
3289  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3290 #else
3291  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
3292 #endif
3293  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3294  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3295  KMP_MB();
3296 
3297  TCW_PTR(this_thr->th.th_task_team, NULL);
3298  }
3299 }
3300 
3301 // __kmp_tasking_barrier:
3302 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
3303 // Internal function to execute all tasks prior to a regular barrier or a join
3304 // barrier. It is a full barrier itself, which unfortunately turns regular
3305 // barriers into double barriers and join barriers into 1 1/2 barriers.
3306 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3307  std::atomic<kmp_uint32> *spin = RCAST(
3308  std::atomic<kmp_uint32> *,
3309  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3310  int flag = FALSE;
3311  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3312 
3313 #if USE_ITT_BUILD
3314  KMP_FSYNC_SPIN_INIT(spin, NULL);
3315 #endif /* USE_ITT_BUILD */
3316  kmp_flag_32 spin_flag(spin, 0U);
3317  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3318  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3319 #if USE_ITT_BUILD
3320  // TODO: What about itt_sync_obj??
3321  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3322 #endif /* USE_ITT_BUILD */
3323 
3324  if (TCR_4(__kmp_global.g.g_done)) {
3325  if (__kmp_global.g.g_abort)
3326  __kmp_abort_thread();
3327  break;
3328  }
3329  KMP_YIELD(TRUE); // GH: We always yield here
3330  }
3331 #if USE_ITT_BUILD
3332  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3333 #endif /* USE_ITT_BUILD */
3334 }
3335 
3336 #if OMP_45_ENABLED
3337 
3338 // __kmp_give_task puts a task into a given thread queue if:
3339 // - the queue for that thread was created
3340 // - there's space in that queue
3341 // Because of this, __kmp_push_task needs to check if there's space after
3342 // getting the lock
3343 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3344  kmp_int32 pass) {
3345  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3346  kmp_task_team_t *task_team = taskdata->td_task_team;
3347 
3348  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3349  taskdata, tid));
3350 
3351 // If task_team is NULL, something has gone badly wrong...
3352  KMP_DEBUG_ASSERT(task_team != NULL);
3353 
3354  bool result = false;
3355  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3356 
3357  if (thread_data->td.td_deque == NULL) {
3358  // There's no queue in this thread, go find another one
3359  // We're guaranteed that at least one thread has a queue
3360  KA_TRACE(30,
3361  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3362  tid, taskdata));
3363  return result;
3364  }
3365 
3366  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3367  TASK_DEQUE_SIZE(thread_data->td)) {
3368  KA_TRACE(
3369  30,
3370  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3371  taskdata, tid));
3372 
3373 // if this deque has already grown past the pass ratio, give another
3374 // thread a chance
3375  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3376  return result;
3377 
3378  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3379  __kmp_realloc_task_deque(thread, thread_data);
3380 
3381  } else {
3382 
3383  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3384 
3385  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3386  TASK_DEQUE_SIZE(thread_data->td)) {
3387  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3388  "thread %d.\n",
3389  taskdata, tid));
3390 
3391 // if this deque has already grown past the pass ratio, give another
3392 // thread a chance
3393  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3394  goto release_and_exit;
3395 
3396  __kmp_realloc_task_deque(thread, thread_data);
3397  }
3398  }
3399 
3400  // lock is held here, and there is space in the deque
3401 
3402  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3403  // Wrap index.
3404  thread_data->td.td_deque_tail =
3405  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3406  TCW_4(thread_data->td.td_deque_ntasks,
3407  TCR_4(thread_data->td.td_deque_ntasks) + 1);
3408 
3409  result = true;
3410  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3411  taskdata, tid));
3412 
3413 release_and_exit:
3414  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3415 
3416  return result;
3417 }
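// The pass argument acts as a back-off for full deques: a deque is only grown
// once its capacity divided by INITIAL_TASK_DEQUE_SIZE is below the current
// pass value; otherwise the caller simply tries the next thread. Worked
// example (sizes expressed in units of the initial capacity I):
//   pass == 1 : every full deque already has capacity >= 1*I, so all full
//               deques are skipped and only a thread with free space accepts;
//   pass == 2 : a full deque of capacity I is doubled to 2*I, while deques
//               already at >= 2*I are still skipped;
//   pass == 4 : deques may grow up to 4*I, and so on.
// __kmpc_proxy_task_completed_ooo (below) doubles pass after each complete
// sweep over the team, which keeps deque growth roughly balanced across
// threads.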
3418 
3419 /* The finish of the proxy tasks is divided in two pieces:
3420  - the top half is the one that can be done from a thread outside the team
3421  - the bottom half must be run from a thread within the team
3422 
3423  In order to run the bottom half the task gets queued back into one of the
3424  threads of the team. Once the td_incomplete_child_tasks counter of the parent
3425  is decremented, the threads can leave the barriers. So, the bottom half needs
3426  to be queued before the counter is decremented. The top half is therefore
3427  divided in two parts:
3428  - things that can be run before queuing the bottom half
3429  - things that must be run after queuing the bottom half
3430 
3431  This creates a second race as the bottom half can free the task before the
3432  second top half is executed. To avoid this, we use the
3433  td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
3434  half. */
3435 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3436  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3437  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3438  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3439  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3440 
3441  taskdata->td_flags.complete = 1; // mark the task as completed
3442 
3443  if (taskdata->td_taskgroup)
3444  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3445 
3446 // Create an imaginary child for this task so the bottom half cannot
3447  // release the task before we have completed the second top half
3448  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3449 }
3450 
3451 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3452  kmp_int32 children = 0;
3453 
3454  // Predecrement simulated by "- 1" calculation
3455  children =
3456  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3457  KMP_DEBUG_ASSERT(children >= 0);
3458 
3459 // Remove the imaginary child
3460  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3461 }
3462 
3463 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3464  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3465  kmp_info_t *thread = __kmp_threads[gtid];
3466 
3467  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3468  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3469  1); // top half must run before bottom half
3470 
3471  // We need to wait to make sure the top half is finished
3472  // Spinning here should be ok as this should happen quickly
3473  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3474  ;
3475 
3476  __kmp_release_deps(gtid, taskdata);
3477  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3478 }
3479 
3488 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3489  KMP_DEBUG_ASSERT(ptask != NULL);
3490  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3491  KA_TRACE(
3492  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3493  gtid, taskdata));
3494 
3495  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3496 
3497  __kmp_first_top_half_finish_proxy(taskdata);
3498  __kmp_second_top_half_finish_proxy(taskdata);
3499  __kmp_bottom_half_finish_proxy(gtid, ptask);
3500 
3501  KA_TRACE(10,
3502  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3503  gtid, taskdata));
3504 }
3505 
3513 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3514  KMP_DEBUG_ASSERT(ptask != NULL);
3515  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3516 
3517  KA_TRACE(
3518  10,
3519  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3520  taskdata));
3521 
3522  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3523 
3524  __kmp_first_top_half_finish_proxy(taskdata);
3525 
3526  // Enqueue task to complete bottom half completion from a thread within the
3527  // corresponding team
3528  kmp_team_t *team = taskdata->td_team;
3529  kmp_int32 nthreads = team->t.t_nproc;
3530  kmp_info_t *thread;
3531 
3532  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3533  // but we cannot use __kmp_get_random here
3534  kmp_int32 start_k = 0;
3535  kmp_int32 pass = 1;
3536  kmp_int32 k = start_k;
3537 
3538  do {
3539  // For now we're just linearly trying to find a thread
3540  thread = team->t.t_threads[k];
3541  k = (k + 1) % nthreads;
3542 
3543  // we did a full pass through all the threads
3544  if (k == start_k)
3545  pass = pass << 1;
3546 
3547  } while (!__kmp_give_task(thread, k, ptask, pass));
3548 
3549  __kmp_second_top_half_finish_proxy(taskdata);
3550 
3551  KA_TRACE(
3552  10,
3553  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3554  taskdata));
3555 }
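// Usage sketch (illustrative; the callback name and the way ptask is captured
// are assumptions, not a fixed interface): a proxy task is created with the
// proxy flag set, handed to some asynchronous engine, and completed later
// from a thread that need not belong to the team:
//
//   void async_done_callback(void *arg) {      // runs on an external thread
//     kmp_task_t *ptask = (kmp_task_t *)arg;
//     __kmpc_proxy_task_completed_ooo(ptask);  // out-of-order completion
//   }
//
// When the completing thread is part of the team and its gtid is known,
// __kmpc_proxy_task_completed(gtid, ptask) above can be used instead; it runs
// the bottom half directly rather than re-queueing it.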
3556 
3557 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3558 // for taskloop
3559 //
3560 // thread: allocating thread
3561 // task_src: pointer to source task to be duplicated
3562 // returns: a pointer to the allocated kmp_task_t structure (task).
3563 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3564  kmp_task_t *task;
3565  kmp_taskdata_t *taskdata;
3566  kmp_taskdata_t *taskdata_src;
3567  kmp_taskdata_t *parent_task = thread->th.th_current_task;
3568  size_t shareds_offset;
3569  size_t task_size;
3570 
3571  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3572  task_src));
3573  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3574  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3575  TASK_FULL); // it should not be proxy task
3576  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3577  task_size = taskdata_src->td_size_alloc;
3578 
3579  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3580  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3581  task_size));
3582 #if USE_FAST_MEMORY
3583  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3584 #else
3585  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3586 #endif /* USE_FAST_MEMORY */
3587  KMP_MEMCPY(taskdata, taskdata_src, task_size);
3588 
3589  task = KMP_TASKDATA_TO_TASK(taskdata);
3590 
3591  // Initialize new task (only specific fields not affected by memcpy)
3592  taskdata->td_task_id = KMP_GEN_TASK_ID();
3593  if (task->shareds != NULL) { // need setup shareds pointer
3594  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3595  task->shareds = &((char *)taskdata)[shareds_offset];
3596  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3597  0);
3598  }
3599  taskdata->td_alloc_thread = thread;
3600  taskdata->td_parent = parent_task;
3601  taskdata->td_taskgroup =
3602  parent_task
3603  ->td_taskgroup; // task inherits the taskgroup from the parent task
3604 
3605  // Only need to keep track of child task counts if team parallel and tasking
3606  // not serialized
3607  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3608  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
3609  if (parent_task->td_taskgroup)
3610  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
3611  // Only need to keep track of allocated child tasks for explicit tasks since
3612  // implicit not deallocated
3613  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3614  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
3615  }
3616 
3617  KA_TRACE(20,
3618  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3619  thread, taskdata, taskdata->td_parent));
3620 #if OMPT_SUPPORT
3621  if (UNLIKELY(ompt_enabled.enabled))
3622  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
3623 #endif
3624  return task;
3625 }
3626 
3627 // Routine optionally generated by the compiler for setting the lastprivate flag
3628 // and calling needed constructors for private/firstprivate objects
3629 // (used to form taskloop tasks from pattern task)
3630 // Parameters: dest task, src task, lastprivate flag.
3631 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3632 
3633 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
3634 
3635 // class to encapsulate manipulating loop bounds in a taskloop task.
3636 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
3637 // the loop bound variables.
3638 class kmp_taskloop_bounds_t {
3639  kmp_task_t *task;
3640  const kmp_taskdata_t *taskdata;
3641  size_t lower_offset;
3642  size_t upper_offset;
3643 
3644 public:
3645  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
3646  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
3647  lower_offset((char *)lb - (char *)task),
3648  upper_offset((char *)ub - (char *)task) {
3649  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
3650  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
3651  }
3652  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
3653  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
3654  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
3655  size_t get_lower_offset() const { return lower_offset; }
3656  size_t get_upper_offset() const { return upper_offset; }
3657  kmp_uint64 get_lb() const {
3658  kmp_int64 retval;
3659 #if defined(KMP_GOMP_COMPAT)
3660  // Intel task just returns the lower bound normally
3661  if (!taskdata->td_flags.native) {
3662  retval = *(kmp_int64 *)((char *)task + lower_offset);
3663  } else {
3664  // GOMP task has to take into account the sizeof(long)
3665  if (taskdata->td_size_loop_bounds == 4) {
3666  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
3667  retval = (kmp_int64)*lb;
3668  } else {
3669  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
3670  retval = (kmp_int64)*lb;
3671  }
3672  }
3673 #else
3674  retval = *(kmp_int64 *)((char *)task + lower_offset);
3675 #endif // defined(KMP_GOMP_COMPAT)
3676  return retval;
3677  }
3678  kmp_uint64 get_ub() const {
3679  kmp_int64 retval;
3680 #if defined(KMP_GOMP_COMPAT)
3681  // Intel task just returns the upper bound normally
3682  if (!taskdata->td_flags.native) {
3683  retval = *(kmp_int64 *)((char *)task + upper_offset);
3684  } else {
3685  // GOMP task has to take into account the sizeof(long)
3686  if (taskdata->td_size_loop_bounds == 4) {
3687  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
3688  retval = (kmp_int64)*ub;
3689  } else {
3690  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
3691  retval = (kmp_int64)*ub;
3692  }
3693  }
3694 #else
3695  retval = *(kmp_int64 *)((char *)task + upper_offset);
3696 #endif // defined(KMP_GOMP_COMPAT)
3697  return retval;
3698  }
3699  void set_lb(kmp_uint64 lb) {
3700 #if defined(KMP_GOMP_COMPAT)
3701  // Intel task just sets the lower bound normally
3702  if (!taskdata->td_flags.native) {
3703  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
3704  } else {
3705  // GOMP task has to take into account the sizeof(long)
3706  if (taskdata->td_size_loop_bounds == 4) {
3707  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
3708  *lower = (kmp_uint32)lb;
3709  } else {
3710  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
3711  *lower = (kmp_uint64)lb;
3712  }
3713  }
3714 #else
3715  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
3716 #endif // defined(KMP_GOMP_COMPAT)
3717  }
3718  void set_ub(kmp_uint64 ub) {
3719 #if defined(KMP_GOMP_COMPAT)
3720  // Intel task just sets the upper bound normally
3721  if (!taskdata->td_flags.native) {
3722  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
3723  } else {
3724  // GOMP task has to take into account the sizeof(long)
3725  if (taskdata->td_size_loop_bounds == 4) {
3726  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
3727  *upper = (kmp_uint32)ub;
3728  } else {
3729  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
3730  *upper = (kmp_uint64)ub;
3731  }
3732  }
3733 #else
3734  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
3735 #endif // defined(KMP_GOMP_COMPAT)
3736  }
3737 };
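// Layout summary for the two interfaces handled by the accessors above
// (derived from the code; the byte offsets themselves are compiler-defined):
//   Intel/LLVM entry points: lb and ub are 64-bit fields inside the
//     kmp_task_t at lower_offset/upper_offset, so get/set is a plain
//     kmp_uint64 load or store at (char *)task + offset.
//   GOMP compatibility (td_flags.native set): the bounds are the first two
//     elements of task->shareds, stored as C "long", so the accessors switch
//     between 32- and 64-bit accesses based on td_size_loop_bounds (4 or 8).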
3738 
3739 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3740 //
3741 // loc Source location information
3742 // gtid Global thread ID
3743 // task Pattern task, exposes the loop iteration range
3744 // lb Pointer to loop lower bound in task structure
3745 // ub Pointer to loop upper bound in task structure
3746 // st Loop stride
3747 // ub_glob Global upper bound (used for lastprivate check)
3748 // num_tasks Number of tasks to execute
3749 // grainsize Number of loop iterations per task
3750 // extras Number of chunks with grainsize+1 iterations
3751 // tc Iterations count
3752 // task_dup Tasks duplication routine
3753 // codeptr_ra Return address for OMPT events
3754 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3755  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3756  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
3757  kmp_uint64 grainsize, kmp_uint64 extras,
3758  kmp_uint64 tc,
3759 #if OMPT_SUPPORT
3760  void *codeptr_ra,
3761 #endif
3762  void *task_dup) {
3763  KMP_COUNT_BLOCK(OMP_TASKLOOP);
3764  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3765  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3766  // compiler provides global bounds here
3767  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
3768  kmp_uint64 lower = task_bounds.get_lb();
3769  kmp_uint64 upper = task_bounds.get_ub();
3770  kmp_uint64 i;
3771  kmp_info_t *thread = __kmp_threads[gtid];
3772  kmp_taskdata_t *current_task = thread->th.th_current_task;
3773  kmp_task_t *next_task;
3774  kmp_int32 lastpriv = 0;
3775 
3776  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3777  KMP_DEBUG_ASSERT(num_tasks > extras);
3778  KMP_DEBUG_ASSERT(num_tasks > 0);
3779  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
3780  "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
3781  gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
3782  task_dup));
3783 
3784  // Launch num_tasks tasks, assign grainsize iterations each task
3785  for (i = 0; i < num_tasks; ++i) {
3786  kmp_uint64 chunk_minus_1;
3787  if (extras == 0) {
3788  chunk_minus_1 = grainsize - 1;
3789  } else {
3790  chunk_minus_1 = grainsize;
3791  --extras; // first extras iterations get bigger chunk (grainsize+1)
3792  }
3793  upper = lower + st * chunk_minus_1;
3794  if (i == num_tasks - 1) {
3795  // schedule the last task, set lastprivate flag if needed
3796  if (st == 1) { // most common case
3797  KMP_DEBUG_ASSERT(upper == *ub);
3798  if (upper == ub_glob)
3799  lastpriv = 1;
3800  } else if (st > 0) { // positive loop stride
3801  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
3802  if ((kmp_uint64)st > ub_glob - upper)
3803  lastpriv = 1;
3804  } else { // negative loop stride
3805  KMP_DEBUG_ASSERT(upper + st < *ub);
3806  if (upper - ub_glob < (kmp_uint64)(-st))
3807  lastpriv = 1;
3808  }
3809  }
3810  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3811  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
3812  kmp_taskloop_bounds_t next_task_bounds =
3813  kmp_taskloop_bounds_t(next_task, task_bounds);
3814 
3815  // adjust task-specific bounds
3816  next_task_bounds.set_lb(lower);
3817  if (next_taskdata->td_flags.native) {
3818  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
3819  } else {
3820  next_task_bounds.set_ub(upper);
3821  }
3822  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
3823  ptask_dup(next_task, task, lastpriv);
3824  KA_TRACE(40,
3825  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
3826  "upper %lld stride %lld, (offsets %p %p)\n",
3827  gtid, i, next_task, lower, upper, st,
3828  next_task_bounds.get_lower_offset(),
3829  next_task_bounds.get_upper_offset()));
3830 #if OMPT_SUPPORT
3831  __kmp_omp_taskloop_task(NULL, gtid, next_task,
3832  codeptr_ra); // schedule new task
3833 #else
3834  __kmp_omp_task(gtid, next_task, true); // schedule new task
3835 #endif
3836  lower = upper + st; // adjust lower bound for the next iteration
3837  }
3838  // free the pattern task and exit
3839  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
3840  // do not execute the pattern task, just do internal bookkeeping
3841  __kmp_task_finish<false>(gtid, task, current_task);
3842 }
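// Worked example of the chunking above (numbers are illustrative): for a trip
// count tc = 10 split into num_tasks = 3 with stride st = 1, the caller passes
// grainsize = tc / num_tasks = 3 and extras = tc % num_tasks = 1, so
//   task 0 gets 4 iterations (grainsize + 1, consuming the one extra),
//   task 1 gets 3 iterations,
//   task 2 gets 3 iterations,
// and 4 + 3 + 3 == 10 == num_tasks * grainsize + extras, which is exactly the
// KMP_DEBUG_ASSERT at the top of the routine. Only the last task compares its
// upper bound against ub_glob to decide whether to set the lastprivate flag.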
3843 
3844 // Structure to keep taskloop parameters for auxiliary task
3845 // kept in the shareds of the task structure.
3846 typedef struct __taskloop_params {
3847  kmp_task_t *task;
3848  kmp_uint64 *lb;
3849  kmp_uint64 *ub;
3850  void *task_dup;
3851  kmp_int64 st;
3852  kmp_uint64 ub_glob;
3853  kmp_uint64 num_tasks;
3854  kmp_uint64 grainsize;
3855  kmp_uint64 extras;
3856  kmp_uint64 tc;
3857  kmp_uint64 num_t_min;
3858 #if OMPT_SUPPORT
3859  void *codeptr_ra;
3860 #endif
3861 } __taskloop_params_t;
3862 
3863 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
3864  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
3865  kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
3866 #if OMPT_SUPPORT
3867  void *,
3868 #endif
3869  void *);
3870 
3871 // Execute part of the taskloop submitted as a task.
3872 int __kmp_taskloop_task(int gtid, void *ptask) {
3873  __taskloop_params_t *p =
3874  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
3875  kmp_task_t *task = p->task;
3876  kmp_uint64 *lb = p->lb;
3877  kmp_uint64 *ub = p->ub;
3878  void *task_dup = p->task_dup;
3879  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3880  kmp_int64 st = p->st;
3881  kmp_uint64 ub_glob = p->ub_glob;
3882  kmp_uint64 num_tasks = p->num_tasks;
3883  kmp_uint64 grainsize = p->grainsize;
3884  kmp_uint64 extras = p->extras;
3885  kmp_uint64 tc = p->tc;
3886  kmp_uint64 num_t_min = p->num_t_min;
3887 #if OMPT_SUPPORT
3888  void *codeptr_ra = p->codeptr_ra;
3889 #endif
3890 #if KMP_DEBUG
3891  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3892  KMP_DEBUG_ASSERT(task != NULL);
3893  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
3894  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
3895  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
3896  task_dup));
3897 #endif
3898  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
3899  if (num_tasks > num_t_min)
3900  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3901  grainsize, extras, tc, num_t_min,
3902 #if OMPT_SUPPORT
3903  codeptr_ra,
3904 #endif
3905  task_dup);
3906  else
3907  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3908  grainsize, extras, tc,
3909 #if OMPT_SUPPORT
3910  codeptr_ra,
3911 #endif
3912  task_dup);
3913 
3914  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
3915  return 0;
3916 }
3917 
3918 // Schedule part of the taskloop as a task,
3919 // execute the rest of the taskloop.
3920 //
3921 // loc Source location information
3922 // gtid Global thread ID
3923 // task Pattern task, exposes the loop iteration range
3924 // lb Pointer to loop lower bound in task structure
3925 // ub Pointer to loop upper bound in task structure
3926 // st Loop stride
3927 // ub_glob Global upper bound (used for lastprivate check)
3928 // num_tasks Number of tasks to execute
3929 // grainsize Number of loop iterations per task
3930 // extras Number of chunks with grainsize+1 iterations
3931 // tc Iterations count
3932 // num_t_min Threshold to launch tasks recursively
3933 // task_dup Tasks duplication routine
3934 // codeptr_ra Return address for OMPT events
3935 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
3936  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3937  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
3938  kmp_uint64 grainsize, kmp_uint64 extras,
3939  kmp_uint64 tc, kmp_uint64 num_t_min,
3940 #if OMPT_SUPPORT
3941  void *codeptr_ra,
3942 #endif
3943  void *task_dup) {
3944 #if KMP_DEBUG
3945  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3946  KMP_DEBUG_ASSERT(task != NULL);
3947  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
3948  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
3949  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
3950  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
3951  task_dup));
3952 #endif
3953  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3954  kmp_uint64 lower = *lb;
3955  kmp_uint64 upper = *ub;
3956  kmp_info_t *thread = __kmp_threads[gtid];
3957  // kmp_taskdata_t *current_task = thread->th.th_current_task;
3958  kmp_task_t *next_task;
3959  kmp_int32 lastpriv = 0;
3960  size_t lower_offset =
3961  (char *)lb - (char *)task; // remember offset of lb in the task structure
3962  size_t upper_offset =
3963  (char *)ub - (char *)task; // remember offset of ub in the task structure
3964 
3965  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3966  KMP_DEBUG_ASSERT(num_tasks > extras);
3967  KMP_DEBUG_ASSERT(num_tasks > 0);
3968 
3969  // split the loop into two halves
3970  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
3971  kmp_uint64 gr_size0 = grainsize;
3972  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
3973  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
3974  if (n_tsk0 <= extras) {
3975  gr_size0++; // integrate extras into grainsize
3976  ext0 = 0; // no extra iters in 1st half
3977  ext1 = extras - n_tsk0; // remaining extras
3978  tc0 = gr_size0 * n_tsk0;
3979  tc1 = tc - tc0;
3980  } else { // n_tsk0 > extras
3981  ext1 = 0; // no extra iters in 2nd half
3982  ext0 = extras;
3983  tc1 = grainsize * n_tsk1;
3984  tc0 = tc - tc1;
3985  }
3986  ub0 = lower + st * (tc0 - 1);
3987  lb1 = ub0 + st;
3988 
3989  // create pattern task for 2nd half of the loop
3990  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
3991  // adjust lower bound (upper bound is not changed) for the 2nd half
3992  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
3993  if (ptask_dup != NULL) // construct firstprivates, etc.
3994  ptask_dup(next_task, task, 0);
3995  *ub = ub0; // adjust upper bound for the 1st half
3996 
3997  // create auxiliary task for 2nd half of the loop
3998  kmp_task_t *new_task =
3999  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4000  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4001  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4002  p->task = next_task;
4003  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4004  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4005  p->task_dup = task_dup;
4006  p->st = st;
4007  p->ub_glob = ub_glob;
4008  p->num_tasks = n_tsk1;
4009  p->grainsize = grainsize;
4010  p->extras = ext1;
4011  p->tc = tc1;
4012  p->num_t_min = num_t_min;
4013 #if OMPT_SUPPORT
4014  p->codeptr_ra = codeptr_ra;
4015 #endif
4016 
4017 #if OMPT_SUPPORT
4018  // schedule new task with correct return address for OMPT events
4019  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4020 #else
4021  __kmp_omp_task(gtid, new_task, true); // schedule new task
4022 #endif
4023 
4024  // execute the 1st half of current subrange
4025  if (n_tsk0 > num_t_min)
4026  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4027  ext0, tc0, num_t_min,
4028 #if OMPT_SUPPORT
4029  codeptr_ra,
4030 #endif
4031  task_dup);
4032  else
4033  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4034  gr_size0, ext0, tc0,
4035 #if OMPT_SUPPORT
4036  codeptr_ra,
4037 #endif
4038  task_dup);
4039 
4040  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4041 }
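// Worked example of the split above (illustrative numbers): for num_tasks = 9,
// grainsize = 5, extras = 3 (so tc = 9 * 5 + 3 = 48):
//   n_tsk0 = 9 >> 1 = 4 (executed here), n_tsk1 = 5 (re-submitted as a task);
//   since n_tsk0 (4) > extras (3): ext0 = 3, ext1 = 0,
//   tc1 = grainsize * n_tsk1 = 25 and tc0 = 48 - 25 = 23 = 4 * 5 + 3.
// In the opposite case (n_tsk0 <= extras) gr_size0 is bumped to grainsize + 1
// so every chunk of the first half is oversized and the remaining extras
// (extras - n_tsk0) travel with the second half. Either way, each half again
// satisfies tc == num_tasks * grainsize + extras for its own parameters.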
4042 
4059 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4060  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4061  int sched, kmp_uint64 grainsize, void *task_dup) {
4062  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4063  KMP_DEBUG_ASSERT(task != NULL);
4064 
4065  if (nogroup == 0) {
4066 #if OMPT_SUPPORT && OMPT_OPTIONAL
4067  OMPT_STORE_RETURN_ADDRESS(gtid);
4068 #endif
4069  __kmpc_taskgroup(loc, gtid);
4070  }
4071 
4072  // =========================================================================
4073  // calculate loop parameters
4074  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4075  kmp_uint64 tc;
4076  // compiler provides global bounds here
4077  kmp_uint64 lower = task_bounds.get_lb();
4078  kmp_uint64 upper = task_bounds.get_ub();
4079  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4080  kmp_uint64 num_tasks = 0, extras = 0;
4081  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4082  kmp_info_t *thread = __kmp_threads[gtid];
4083  kmp_taskdata_t *current_task = thread->th.th_current_task;
4084 
4085  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4086  "grain %llu(%d), dup %p\n",
4087  gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4088 
4089  // compute trip count
4090  if (st == 1) { // most common case
4091  tc = upper - lower + 1;
4092  } else if (st < 0) {
4093  tc = (lower - upper) / (-st) + 1;
4094  } else { // st > 0
4095  tc = (upper - lower) / st + 1;
4096  }
4097  if (tc == 0) {
4098  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4099  // free the pattern task and exit
4100  __kmp_task_start(gtid, task, current_task);
4101  // do not execute anything for zero-trip loop
4102  __kmp_task_finish<false>(gtid, task, current_task);
4103  return;
4104  }
4105 
4106 #if OMPT_SUPPORT && OMPT_OPTIONAL
4107  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4108  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4109  if (ompt_enabled.ompt_callback_work) {
4110  ompt_callbacks.ompt_callback(ompt_callback_work)(
4111  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4112  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4113  }
4114 #endif
4115 
4116  if (num_tasks_min == 0)
4117  // TODO: can we choose a better default heuristic?
4118  num_tasks_min =
4119  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4120 
4121  // compute num_tasks/grainsize based on the input provided
4122  switch (sched) {
4123  case 0: // no schedule clause specified, we can choose the default
4124  // let's try to schedule (team_size*10) tasks
4125  grainsize = thread->th.th_team_nproc * 10;
4126  case 2: // num_tasks provided
4127  if (grainsize > tc) {
4128  num_tasks = tc; // too big num_tasks requested, adjust values
4129  grainsize = 1;
4130  extras = 0;
4131  } else {
4132  num_tasks = grainsize;
4133  grainsize = tc / num_tasks;
4134  extras = tc % num_tasks;
4135  }
4136  break;
4137  case 1: // grainsize provided
4138  if (grainsize > tc) {
4139  num_tasks = 1; // too big grainsize requested, adjust values
4140  grainsize = tc;
4141  extras = 0;
4142  } else {
4143  num_tasks = tc / grainsize;
4144  // adjust grainsize for balanced distribution of iterations
4145  grainsize = tc / num_tasks;
4146  extras = tc % num_tasks;
4147  }
4148  break;
4149  default:
4150  KMP_ASSERT2(0, "unknown scheduling of taskloop");
4151  }
4152  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4153  KMP_DEBUG_ASSERT(num_tasks > extras);
4154  KMP_DEBUG_ASSERT(num_tasks > 0);
4155  // =========================================================================
4156 
4157  // check the if clause value first
4158  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
4159  if (if_val == 0) { // if(0) specified, mark task as serial
4160  taskdata->td_flags.task_serial = 1;
4161  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4162  // always start serial tasks linearly
4163  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4164  grainsize, extras, tc,
4165 #if OMPT_SUPPORT
4166  OMPT_GET_RETURN_ADDRESS(0),
4167 #endif
4168  task_dup);
4169  // !taskdata->td_flags.native => currently force linear spawning of tasks
4170  // for GOMP_taskloop
4171  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4172  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4173  "(%lld), grain %llu, extras %llu\n",
4174  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4175  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4176  grainsize, extras, tc, num_tasks_min,
4177 #if OMPT_SUPPORT
4178  OMPT_GET_RETURN_ADDRESS(0),
4179 #endif
4180  task_dup);
4181  } else {
4182  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4183  "(%lld), grain %llu, extras %llu\n",
4184  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4185  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4186  grainsize, extras, tc,
4187 #if OMPT_SUPPORT
4188  OMPT_GET_RETURN_ADDRESS(0),
4189 #endif
4190  task_dup);
4191  }
4192 
4193 #if OMPT_SUPPORT && OMPT_OPTIONAL
4194  if (ompt_enabled.ompt_callback_work) {
4195  ompt_callbacks.ompt_callback(ompt_callback_work)(
4196  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4197  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4198  }
4199 #endif
4200 
4201  if (nogroup == 0) {
4202 #if OMPT_SUPPORT && OMPT_OPTIONAL
4203  OMPT_STORE_RETURN_ADDRESS(gtid);
4204 #endif
4205  __kmpc_end_taskgroup(loc, gtid);
4206  }
4207  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4208 }
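// Lowering sketch (illustrative; the exact code a compiler emits varies): a
// directive such as
//
//   #pragma omp taskloop grainsize(4)
//   for (long i = 0; i < n; ++i)
//     body(i);
//
// becomes a pattern task whose entry routine runs body() over the [lb, ub]
// range stored in the task, followed by a runtime call roughly of the form
//
//   __kmpc_taskloop(loc, gtid, pattern_task, /*if_val=*/1, &lb, &ub,
//                   /*st=*/1, /*nogroup=*/0, /*sched=*/1,
//                   /*grainsize=*/4, /*task_dup=*/dup_fn);
//
// where sched records which clause was used (0: none, 1: grainsize,
// 2: num_tasks) as handled by the switch above; note that the sched == 0 case
// intentionally falls through to the num_tasks path after choosing
// team_size * 10 tasks.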
4209 
4210 #endif