LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_i18n.h"
16 #include "kmp_itt.h"
17 #include "kmp_stats.h"
18 #include "kmp_wait_release.h"
19 #include "kmp_taskdeps.h"
20 
21 #if OMPT_SUPPORT
22 #include "ompt-specific.h"
23 #endif
24 
25 #include "tsan_annotations.h"
26 
27 /* forward declarations */
28 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
29  kmp_info_t *this_thr);
30 static void __kmp_alloc_task_deque(kmp_info_t *thread,
31  kmp_thread_data_t *thread_data);
32 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
33  kmp_task_team_t *task_team);
34 
35 #if OMP_45_ENABLED
36 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
37 #endif
38 
39 #ifdef BUILD_TIED_TASK_STACK
40 
41 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
42 // from top to bottom
43 //
44 // gtid: global thread identifier for thread containing stack
45 // thread_data: thread data for task team thread containing stack
46 // threshold: value above which the trace statement triggers
47 // location: string identifying call site of this function (for trace)
48 static void __kmp_trace_task_stack(kmp_int32 gtid,
49  kmp_thread_data_t *thread_data,
50  int threshold, char *location) {
51  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
52  kmp_taskdata_t **stack_top = task_stack->ts_top;
53  kmp_int32 entries = task_stack->ts_entries;
54  kmp_taskdata_t *tied_task;
55 
56  KA_TRACE(
57  threshold,
58  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
59  "first_block = %p, stack_top = %p \n",
60  location, gtid, entries, task_stack->ts_first_block, stack_top));
61 
62  KMP_DEBUG_ASSERT(stack_top != NULL);
63  KMP_DEBUG_ASSERT(entries > 0);
64 
65  while (entries != 0) {
66  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
67  // fix up ts_top if we need to pop from previous block
68  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
69  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
70 
71  stack_block = stack_block->sb_prev;
72  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
73  }
74 
75  // finish bookkeeping
76  stack_top--;
77  entries--;
78 
79  tied_task = *stack_top;
80 
81  KMP_DEBUG_ASSERT(tied_task != NULL);
82  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
83 
84  KA_TRACE(threshold,
85  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
86  "stack_top=%p, tied_task=%p\n",
87  location, gtid, entries, stack_top, tied_task));
88  }
89  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
90 
91  KA_TRACE(threshold,
92  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
93  location, gtid));
94 }
95 
96 // __kmp_init_task_stack: initialize the task stack for the first time
97 // after a thread_data structure is created.
98 // It should not be necessary to do this again (assuming the stack works).
99 //
100 // gtid: global thread identifier of calling thread
101 // thread_data: thread data for task team thread containing stack
102 static void __kmp_init_task_stack(kmp_int32 gtid,
103  kmp_thread_data_t *thread_data) {
104  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
105  kmp_stack_block_t *first_block;
106 
107  // set up the first block of the stack
108  first_block = &task_stack->ts_first_block;
109  task_stack->ts_top = (kmp_taskdata_t **)first_block;
110  memset((void *)first_block, '\0',
111  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
112 
113  // initialize the stack to be empty
114  task_stack->ts_entries = TASK_STACK_EMPTY;
115  first_block->sb_next = NULL;
116  first_block->sb_prev = NULL;
117 }
118 
119 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
120 //
121 // gtid: global thread identifier for calling thread
122 // thread_data: thread info for thread containing stack
123 static void __kmp_free_task_stack(kmp_int32 gtid,
124  kmp_thread_data_t *thread_data) {
125  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
126  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
127 
128  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
129  // free from the second block of the stack
130  while (stack_block != NULL) {
131  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
132 
133  stack_block->sb_next = NULL;
134  stack_block->sb_prev = NULL;
135  if (stack_block != &task_stack->ts_first_block) {
136  __kmp_thread_free(__kmp_threads[gtid],
137  stack_block); // free the block, if not the first
138  }
139  stack_block = next_block;
140  }
141  // initialize the stack to be empty
142  task_stack->ts_entries = 0;
143  task_stack->ts_top = NULL;
144 }
145 
146 // __kmp_push_task_stack: Push the tied task onto the task stack.
147 // Grow the stack if necessary by allocating another block.
148 //
149 // gtid: global thread identifier for calling thread
150 // thread: thread info for thread containing stack
151 // tied_task: the task to push on the stack
152 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
153  kmp_taskdata_t *tied_task) {
154  // GEH - need to consider what to do if tt_threads_data not allocated yet
155  kmp_thread_data_t *thread_data =
156  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
157  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
158 
159  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
160  return; // Don't push anything on stack if team or team tasks are serialized
161  }
162 
163  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
164  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
165 
166  KA_TRACE(20,
167  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
168  gtid, thread, tied_task));
169  // Store entry
170  *(task_stack->ts_top) = tied_task;
171 
172  // Do bookkeeping for next push
173  task_stack->ts_top++;
174  task_stack->ts_entries++;
175 
176  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
177  // Find beginning of this task block
178  kmp_stack_block_t *stack_block =
179  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
180 
181  // Check if we already have a block
182  if (stack_block->sb_next !=
183  NULL) { // reset ts_top to beginning of next block
184  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
185  } else { // Alloc new block and link it up
186  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
187  thread, sizeof(kmp_stack_block_t));
188 
189  task_stack->ts_top = &new_block->sb_block[0];
190  stack_block->sb_next = new_block;
191  new_block->sb_prev = stack_block;
192  new_block->sb_next = NULL;
193 
194  KA_TRACE(
195  30,
196  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
197  gtid, tied_task, new_block));
198  }
199  }
200  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
201  tied_task));
202 }
203 
204 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
205 // the task, just check to make sure it matches the ending task passed in.
206 //
207 // gtid: global thread identifier for the calling thread
208 // thread: thread info structure containing stack
209 // tied_task: the task popped off the stack
210 // ending_task: the task that is ending (should match popped task)
211 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
212  kmp_taskdata_t *ending_task) {
213  // GEH - need to consider what to do if tt_threads_data not allocated yet
214  kmp_thread_data_t *thread_data =
215  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
216  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
217  kmp_taskdata_t *tied_task;
218 
219  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
220  // Don't pop anything from stack if team or team tasks are serialized
221  return;
222  }
223 
224  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
225  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
226 
227  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
228  thread));
229 
230  // fix up ts_top if we need to pop from previous block
231  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
232  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
233 
234  stack_block = stack_block->sb_prev;
235  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
236  }
237 
238  // finish bookkeeping
239  task_stack->ts_top--;
240  task_stack->ts_entries--;
241 
242  tied_task = *(task_stack->ts_top);
243 
244  KMP_DEBUG_ASSERT(tied_task != NULL);
245  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
246  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
247 
248  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
249  tied_task));
250  return;
251 }
252 #endif /* BUILD_TIED_TASK_STACK */
253 
254 // returns 1 if new task is allowed to execute, 0 otherwise
255 // checks Task Scheduling constraint (if requested) and
256 // mutexinoutset dependencies if any
257 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
258  const kmp_taskdata_t *tasknew,
259  const kmp_taskdata_t *taskcurr) {
260  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
261  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
262  // only descendant of all deferred tied tasks can be scheduled, checking
263  // the last one is enough, as it in turn is the descendant of all others
264  kmp_taskdata_t *current = taskcurr->td_last_tied;
265  KMP_DEBUG_ASSERT(current != NULL);
266  // check if the task is not suspended on barrier
267  if (current->td_flags.tasktype == TASK_EXPLICIT ||
268  current->td_taskwait_thread > 0) { // <= 0 on barrier
269  kmp_int32 level = current->td_level;
270  kmp_taskdata_t *parent = tasknew->td_parent;
271  while (parent != current && parent->td_level > level) {
272  // check generation up to the level of the current task
273  parent = parent->td_parent;
274  KMP_DEBUG_ASSERT(parent != NULL);
275  }
276  if (parent != current)
277  return false;
278  }
279  }
280  // Check mutexinoutset dependencies, acquire locks
281  kmp_depnode_t *node = tasknew->td_depnode;
282  if (node && (node->dn.mtx_num_locks > 0)) {
283  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
284  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
285  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
286  continue;
287  // could not get the lock, release previous locks
288  for (int j = i - 1; j >= 0; --j)
289  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
290  return false;
291  }
292  // negative num_locks means all locks acquired successfully
293  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
294  }
295  return true;
296 }
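// A minimal illustrative sketch (not part of the runtime) of how a scheduling
// path might consult the check above before running a candidate task; the
// names "candidate" and "thread" are assumed here, not taken from a caller.
#if 0
  if (!__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, candidate,
                             thread->th.th_current_task)) {
    // Not a descendant of the last deferred tied task (TSC), or its
    // mutexinoutset locks could not all be acquired: leave it in the deque.
  }
#endif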
297 
298 // __kmp_realloc_task_deque:
299 // Re-allocates a task deque for a particular thread, copies the content from
300 // the old deque and adjusts the necessary data structures relating to the
301 // deque. This operation must be done with the deque_lock being held
302 static void __kmp_realloc_task_deque(kmp_info_t *thread,
303  kmp_thread_data_t *thread_data) {
304  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
305  kmp_int32 new_size = 2 * size;
306 
307  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
308  "%d] for thread_data %p\n",
309  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
310 
311  kmp_taskdata_t **new_deque =
312  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
313 
314  int i, j;
315  for (i = thread_data->td.td_deque_head, j = 0; j < size;
316  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
317  new_deque[j] = thread_data->td.td_deque[i];
318 
319  __kmp_free(thread_data->td.td_deque);
320 
321  thread_data->td.td_deque_head = 0;
322  thread_data->td.td_deque_tail = size;
323  thread_data->td.td_deque = new_deque;
324  thread_data->td.td_deque_size = new_size;
325 }
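// Worked example of the copy loop above, assuming a hypothetical full deque of
// size 4 (mask 3) holding tasks A,B,C,D with head == tail == 2: the loop reads
// old slots 2,3,0,1 and writes C,D,A,B into new slots 0..3, so the doubled
// deque ends up with head = 0, tail = 4 and room for four more entries.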
326 
327 // __kmp_push_task: Add a task to the thread's deque
328 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
329  kmp_info_t *thread = __kmp_threads[gtid];
330  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
331  kmp_task_team_t *task_team = thread->th.th_task_team;
332  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
333  kmp_thread_data_t *thread_data;
334 
335  KA_TRACE(20,
336  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
337 
338  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
339  // untied task needs to increment counter so that the task structure is not
340  // freed prematurely
341  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
342  KMP_DEBUG_USE_VAR(counter);
343  KA_TRACE(
344  20,
345  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
346  gtid, counter, taskdata));
347  }
348 
349  // The first check avoids building task_team thread data if serialized
350  if (taskdata->td_flags.task_serial) {
351  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
352  "TASK_NOT_PUSHED for task %p\n",
353  gtid, taskdata));
354  return TASK_NOT_PUSHED;
355  }
356 
357  // Now that serialized tasks have returned, we can assume that we are not in
358  // immediate exec mode
359  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
360  if (!KMP_TASKING_ENABLED(task_team)) {
361  __kmp_enable_tasking(task_team, thread);
362  }
363  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
364  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
365 
366  // Find tasking deque specific to encountering thread
367  thread_data = &task_team->tt.tt_threads_data[tid];
368 
369  // No lock needed since only owner can allocate
370  if (thread_data->td.td_deque == NULL) {
371  __kmp_alloc_task_deque(thread, thread_data);
372  }
373 
374  int locked = 0;
375  // Check if deque is full
376  if (TCR_4(thread_data->td.td_deque_ntasks) >=
377  TASK_DEQUE_SIZE(thread_data->td)) {
378  if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
379  thread->th.th_current_task)) {
380  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
381  "TASK_NOT_PUSHED for task %p\n",
382  gtid, taskdata));
383  return TASK_NOT_PUSHED;
384  } else {
385  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
386  locked = 1;
387  // expand deque to push the task which is not allowed to execute
388  __kmp_realloc_task_deque(thread, thread_data);
389  }
390  }
391  // Lock the deque for the task push operation
392  if (!locked) {
393  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
394 #if OMP_45_ENABLED
395  // Need to recheck as we can get a proxy task from thread outside of OpenMP
396  if (TCR_4(thread_data->td.td_deque_ntasks) >=
397  TASK_DEQUE_SIZE(thread_data->td)) {
398  if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
399  thread->th.th_current_task)) {
400  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
401  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
402  "returning TASK_NOT_PUSHED for task %p\n",
403  gtid, taskdata));
404  return TASK_NOT_PUSHED;
405  } else {
406  // expand deque to push the task which is not allowed to execute
407  __kmp_realloc_task_deque(thread, thread_data);
408  }
409  }
410 #endif
411  }
412  // Must have room since only the calling thread can add tasks to this deque
413  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
414  TASK_DEQUE_SIZE(thread_data->td));
415 
416  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
417  taskdata; // Push taskdata
418  // Wrap index.
419  thread_data->td.td_deque_tail =
420  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
421  TCW_4(thread_data->td.td_deque_ntasks,
422  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
423 
424  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
425  "task=%p ntasks=%d head=%u tail=%u\n",
426  gtid, taskdata, thread_data->td.td_deque_ntasks,
427  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
428 
429  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
430 
431  return TASK_SUCCESSFULLY_PUSHED;
432 }
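// Illustrative sketch (not part of the runtime) of the caller's contract,
// following the pattern used elsewhere in this file: when the push is refused,
// the encountering thread is expected to run the task itself.
#if 0
  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) {
    // assumed fallback for illustration; the real entry points decide how and
    // when to invoke the task directly
    __kmp_invoke_task(gtid, new_task, thread->th.th_current_task);
  }
#endif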
433 
434 // __kmp_pop_current_task_from_thread: restore the current task of the given
435 // thread to its parent when the team ends
436 //
437 // this_thr: thread structure to set current_task in.
438 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
439  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
440  "this_thread=%p, curtask=%p, "
441  "curtask_parent=%p\n",
442  0, this_thr, this_thr->th.th_current_task,
443  this_thr->th.th_current_task->td_parent));
444 
445  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
446 
447  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
448  "this_thread=%p, curtask=%p, "
449  "curtask_parent=%p\n",
450  0, this_thr, this_thr->th.th_current_task,
451  this_thr->th.th_current_task->td_parent));
452 }
453 
454 // __kmp_push_current_task_to_thread: set up current task in called thread for a
455 // new team
456 //
457 // this_thr: thread structure to set up
458 // team: team for implicit task data
459 // tid: thread within team to set up
460 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
461  int tid) {
462  // the thread's current task becomes the parent of the newly created
463  // implicit tasks of the new team
464  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
465  "curtask=%p "
466  "parent_task=%p\n",
467  tid, this_thr, this_thr->th.th_current_task,
468  team->t.t_implicit_task_taskdata[tid].td_parent));
469 
470  KMP_DEBUG_ASSERT(this_thr != NULL);
471 
472  if (tid == 0) {
473  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
474  team->t.t_implicit_task_taskdata[0].td_parent =
475  this_thr->th.th_current_task;
476  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
477  }
478  } else {
479  team->t.t_implicit_task_taskdata[tid].td_parent =
480  team->t.t_implicit_task_taskdata[0].td_parent;
481  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
482  }
483 
484  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
485  "curtask=%p "
486  "parent_task=%p\n",
487  tid, this_thr, this_thr->th.th_current_task,
488  team->t.t_implicit_task_taskdata[tid].td_parent));
489 }
490 
491 // __kmp_task_start: bookkeeping for a task starting execution
492 //
493 // GTID: global thread id of calling thread
494 // task: task starting execution
495 // current_task: task suspending
496 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
497  kmp_taskdata_t *current_task) {
498  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
499  kmp_info_t *thread = __kmp_threads[gtid];
500 
501  KA_TRACE(10,
502  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
503  gtid, taskdata, current_task));
504 
505  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
506 
507  // mark currently executing task as suspended
508  // TODO: GEH - make sure root team implicit task is initialized properly.
509  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
510  current_task->td_flags.executing = 0;
511 
512 // Add task to stack if tied
513 #ifdef BUILD_TIED_TASK_STACK
514  if (taskdata->td_flags.tiedness == TASK_TIED) {
515  __kmp_push_task_stack(gtid, thread, taskdata);
516  }
517 #endif /* BUILD_TIED_TASK_STACK */
518 
519  // mark starting task as executing and as current task
520  thread->th.th_current_task = taskdata;
521 
522  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
523  taskdata->td_flags.tiedness == TASK_UNTIED);
524  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
525  taskdata->td_flags.tiedness == TASK_UNTIED);
526  taskdata->td_flags.started = 1;
527  taskdata->td_flags.executing = 1;
528  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
529  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
530 
531  // GEH TODO: shouldn't we pass some sort of location identifier here?
532  // APT: yes, we will pass location here.
533  // need to store current thread state (in a thread or taskdata structure)
534  // before setting work_state, otherwise wrong state is set after end of task
535 
536  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
537 
538  return;
539 }
540 
541 #if OMPT_SUPPORT
542 //------------------------------------------------------------------------------
543 // __ompt_task_init:
544 // Initialize OMPT fields maintained by a task. This will only be called after
545 // ompt_start_tool, so we already know whether ompt is enabled or not.
546 
547 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
548  // The calls to __ompt_task_init already have the ompt_enabled condition.
549  task->ompt_task_info.task_data.value = 0;
550  task->ompt_task_info.frame.exit_frame = ompt_data_none;
551  task->ompt_task_info.frame.enter_frame = ompt_data_none;
552  task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
553  task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
554 #if OMP_40_ENABLED
555  task->ompt_task_info.ndeps = 0;
556  task->ompt_task_info.deps = NULL;
557 #endif /* OMP_40_ENABLED */
558 }
559 
560 // __ompt_task_start:
561 // Build and trigger task-begin event
562 static inline void __ompt_task_start(kmp_task_t *task,
563  kmp_taskdata_t *current_task,
564  kmp_int32 gtid) {
565  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
566  ompt_task_status_t status = ompt_task_switch;
567  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
568  status = ompt_task_yield;
569  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
570  }
571  /* let OMPT know that we're about to run this task */
572  if (ompt_enabled.ompt_callback_task_schedule) {
573  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
574  &(current_task->ompt_task_info.task_data), status,
575  &(taskdata->ompt_task_info.task_data));
576  }
577  taskdata->ompt_task_info.scheduling_parent = current_task;
578 }
579 
580 // __ompt_task_finish:
581 // Build and trigger final task-schedule event
582 static inline void
583 __ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
584  ompt_task_status_t status = ompt_task_complete) {
585  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
586  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
587  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
588  status = ompt_task_cancel;
589  }
590 
591  /* let OMPT know that we're returning to the callee task */
592  if (ompt_enabled.ompt_callback_task_schedule) {
593  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
594  &(taskdata->ompt_task_info.task_data), status,
595  &((resumed_task ? resumed_task
596  : (taskdata->ompt_task_info.scheduling_parent
597  ? taskdata->ompt_task_info.scheduling_parent
598  : taskdata->td_parent))
599  ->ompt_task_info.task_data));
600  }
601 }
602 #endif
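// Illustrative sketch (not part of the runtime) of the tool side that would
// consume the task-schedule events dispatched above; the callback name is
// assumed, the parameter list follows ompt_callback_task_schedule_t.
#if 0
static void on_task_schedule(ompt_data_t *prior_task_data,
                             ompt_task_status_t prior_task_status,
                             ompt_data_t *next_task_data) {
  // prior_task_status arrives as ompt_task_switch, ompt_task_yield,
  // ompt_task_complete or ompt_task_cancel, matching the dispatch above.
}
// registered during tool initialization, e.g.:
//   ompt_set_callback(ompt_callback_task_schedule,
//                     (ompt_callback_t)&on_task_schedule);
#endif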
603 
604 template <bool ompt>
605 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
606  kmp_task_t *task,
607  void *frame_address,
608  void *return_address) {
609  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
610  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
611 
612  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
613  "current_task=%p\n",
614  gtid, loc_ref, taskdata, current_task));
615 
616  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
617  // untied task needs to increment counter so that the task structure is not
618  // freed prematurely
619  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
620  KMP_DEBUG_USE_VAR(counter);
621  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
622  "incremented for task %p\n",
623  gtid, counter, taskdata));
624  }
625 
626  taskdata->td_flags.task_serial =
627  1; // Execute this task immediately, not deferred.
628  __kmp_task_start(gtid, task, current_task);
629 
630 #if OMPT_SUPPORT
631  if (ompt) {
632  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
633  current_task->ompt_task_info.frame.enter_frame.ptr =
634  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
635  current_task->ompt_task_info.frame.enter_frame_flags =
636  taskdata->ompt_task_info.frame.exit_frame_flags = ompt_frame_application | ompt_frame_framepointer;
637  }
638  if (ompt_enabled.ompt_callback_task_create) {
639  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
640  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
641  &(parent_info->task_data), &(parent_info->frame),
642  &(taskdata->ompt_task_info.task_data),
643  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
644  return_address);
645  }
646  __ompt_task_start(task, current_task, gtid);
647  }
648 #endif // OMPT_SUPPORT
649 
650  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
651  loc_ref, taskdata));
652 }
653 
654 #if OMPT_SUPPORT
655 OMPT_NOINLINE
656 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
657  kmp_task_t *task,
658  void *frame_address,
659  void *return_address) {
660  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
661  return_address);
662 }
663 #endif // OMPT_SUPPORT
664 
665 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
666 // execution
667 //
668 // loc_ref: source location information; points to beginning of task block.
669 // gtid: global thread number.
670 // task: task thunk for the started task.
671 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
672  kmp_task_t *task) {
673 #if OMPT_SUPPORT
674  if (UNLIKELY(ompt_enabled.enabled)) {
675  OMPT_STORE_RETURN_ADDRESS(gtid);
676  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
677  OMPT_GET_FRAME_ADDRESS(1),
678  OMPT_LOAD_RETURN_ADDRESS(gtid));
679  return;
680  }
681 #endif
682  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
683 }
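// Illustrative sketch (not part of the runtime) of how a compiler might lower
// "#pragma omp task if(0)" onto the entry points in this file
// (__kmpc_omp_task_complete_if0 appears further below); the outlined routine
// name and the sizes passed to the allocator are assumed for the example.
#if 0
  kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /* flags = */ 1 /* tied */,
                                        sizeof(kmp_task_t),
                                        /* sizeof_shareds = */ 0,
                                        &outlined_task_entry);
  __kmpc_omp_task_begin_if0(&loc, gtid, t);
  outlined_task_entry(gtid, t); // body runs immediately on this thread
  __kmpc_omp_task_complete_if0(&loc, gtid, t);
#endif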
684 
685 #ifdef TASK_UNUSED
686 // __kmpc_omp_task_begin: report that a given task has started execution
687 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
688 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
689  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
690 
691  KA_TRACE(
692  10,
693  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
694  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
695 
696  __kmp_task_start(gtid, task, current_task);
697 
698  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
699  loc_ref, KMP_TASK_TO_TASKDATA(task)));
700  return;
701 }
702 #endif // TASK_UNUSED
703 
704 // __kmp_free_task: free the current task space and the space for shareds
705 //
706 // gtid: Global thread ID of calling thread
707 // taskdata: task to free
708 // thread: thread data structure of caller
709 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
710  kmp_info_t *thread) {
711  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
712  taskdata));
713 
714  // Check to make sure all flags and counters have the correct values
715  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
716  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
717  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
718  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
719  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
720  taskdata->td_flags.task_serial == 1);
721  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
722 
723  taskdata->td_flags.freed = 1;
724  ANNOTATE_HAPPENS_BEFORE(taskdata);
725 // deallocate the taskdata and shared variable blocks associated with this task
726 #if USE_FAST_MEMORY
727  __kmp_fast_free(thread, taskdata);
728 #else /* ! USE_FAST_MEMORY */
729  __kmp_thread_free(thread, taskdata);
730 #endif
731 
732  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
733 }
734 
735 // __kmp_free_task_and_ancestors: free the current task and ancestors without
736 // children
737 //
738 // gtid: Global thread ID of calling thread
739 // taskdata: task to free
740 // thread: thread data structure of caller
741 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
742  kmp_taskdata_t *taskdata,
743  kmp_info_t *thread) {
744 #if OMP_45_ENABLED
745  // Proxy tasks must always be allowed to free their parents
746  // because they can be run in background even in serial mode.
747  kmp_int32 team_serial =
748  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
749  !taskdata->td_flags.proxy;
750 #else
751  kmp_int32 team_serial =
752  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
753 #endif
754  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
755 
756  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
757  KMP_DEBUG_ASSERT(children >= 0);
758 
759  // Now, go up the ancestor tree to see if any ancestors can now be freed.
760  while (children == 0) {
761  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
762 
763  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
764  "and freeing itself\n",
765  gtid, taskdata));
766 
767  // --- Deallocate my ancestor task ---
768  __kmp_free_task(gtid, taskdata, thread);
769 
770  taskdata = parent_taskdata;
771 
772  if (team_serial)
773  return;
774  // Stop checking ancestors at implicit task instead of walking up ancestor
775  // tree to avoid premature deallocation of ancestors.
776  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
777  if (taskdata->td_dephash) { // do we need to cleanup dephash?
778  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
779  kmp_tasking_flags_t flags_old = taskdata->td_flags;
780  if (children == 0 && flags_old.complete == 1) {
781  kmp_tasking_flags_t flags_new = flags_old;
782  flags_new.complete = 0;
783  if (KMP_COMPARE_AND_STORE_ACQ32(
784  RCAST(kmp_int32 *, &taskdata->td_flags),
785  *RCAST(kmp_int32 *, &flags_old),
786  *RCAST(kmp_int32 *, &flags_new))) {
787  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
788  "dephash of implicit task %p\n",
789  gtid, taskdata));
790  // cleanup dephash of finished implicit task
791  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
792  }
793  }
794  }
795  return;
796  }
797  // Predecrement simulated by "- 1" calculation
798  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
799  KMP_DEBUG_ASSERT(children >= 0);
800  }
801 
802  KA_TRACE(
803  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
804  "not freeing it yet\n",
805  gtid, taskdata, children));
806 }
807 
808 // __kmp_task_finish: bookkeeping to do when a task finishes execution
809 //
810 // gtid: global thread ID for calling thread
811 // task: task to be finished
812 // resumed_task: task to be resumed. (may be NULL if task is serialized)
813 template <bool ompt>
814 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
815  kmp_taskdata_t *resumed_task) {
816  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
817  kmp_info_t *thread = __kmp_threads[gtid];
818 #if OMP_45_ENABLED
819  kmp_task_team_t *task_team =
820  thread->th.th_task_team; // might be NULL for serial teams...
821 #endif // OMP_45_ENABLED
822  kmp_int32 children = 0;
823 
824  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
825  "task %p\n",
826  gtid, taskdata, resumed_task));
827 
828  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
829 
830 // Pop task from stack if tied
831 #ifdef BUILD_TIED_TASK_STACK
832  if (taskdata->td_flags.tiedness == TASK_TIED) {
833  __kmp_pop_task_stack(gtid, thread, taskdata);
834  }
835 #endif /* BUILD_TIED_TASK_STACK */
836 
837  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
838  // untied task needs to check the counter so that the task structure is not
839  // freed prematurely
840  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
841  KA_TRACE(
842  20,
843  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
844  gtid, counter, taskdata));
845  if (counter > 0) {
846  // untied task is not done, to be continued possibly by other thread, do
847  // not free it now
848  if (resumed_task == NULL) {
849  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
850  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
851  // task is the parent
852  }
853  thread->th.th_current_task = resumed_task; // restore current_task
854  resumed_task->td_flags.executing = 1; // resume previous task
855  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
856  "resuming task %p\n",
857  gtid, taskdata, resumed_task));
858  return;
859  }
860  }
861 #if OMPT_SUPPORT
862  if (ompt)
863  __ompt_task_finish(task, resumed_task);
864 #endif
865 
866  // Check mutexinoutset dependencies, release locks
867  kmp_depnode_t *node = taskdata->td_depnode;
868  if (node && (node->dn.mtx_num_locks < 0)) {
869  // negative num_locks means all locks were acquired
870  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
871  for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
872  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
873  __kmp_release_lock(node->dn.mtx_locks[i], gtid);
874  }
875  }
876 
877  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
878  taskdata->td_flags.complete = 1; // mark the task as completed
879  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
880  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
881 
882  // Only need to keep track of count if team parallel and tasking not
883  // serialized
884  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
885  // Predecrement simulated by "- 1" calculation
886  children =
887  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
888  KMP_DEBUG_ASSERT(children >= 0);
889 #if OMP_40_ENABLED
890  if (taskdata->td_taskgroup)
891  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
892  __kmp_release_deps(gtid, taskdata);
893 #if OMP_45_ENABLED
894  } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
895  // if we found proxy tasks there could exist a dependency chain
896  // with the proxy task as origin
897  __kmp_release_deps(gtid, taskdata);
898 #endif // OMP_45_ENABLED
899 #endif // OMP_40_ENABLED
900  }
901 
902  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
903  // called. Otherwise, if a task is executed immediately from the release_deps
904  // code, the flag will be reset to 1 again by this same function
905  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
906  taskdata->td_flags.executing = 0; // suspend the finishing task
907 
908  KA_TRACE(
909  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
910  gtid, taskdata, children));
911 
912 #if OMP_40_ENABLED
913  /* If the task's destructor thunk flag has been set, we need to invoke the
914  destructor thunk that has been generated by the compiler. The code is
915  placed here, since at this point other tasks might have been released
916  hence overlapping the destructor invocations with some other work in the
917  released tasks. The OpenMP spec is not specific on when the destructors
918  are invoked, so we should be free to choose. */
919  if (taskdata->td_flags.destructors_thunk) {
920  kmp_routine_entry_t destr_thunk = task->data1.destructors;
921  KMP_ASSERT(destr_thunk);
922  destr_thunk(gtid, task);
923  }
924 #endif // OMP_40_ENABLED
925 
926  // bookkeeping for resuming task:
927  // GEH - note tasking_ser => task_serial
928  KMP_DEBUG_ASSERT(
929  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
930  taskdata->td_flags.task_serial);
931  if (taskdata->td_flags.task_serial) {
932  if (resumed_task == NULL) {
933  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
934  // task is the parent
935  }
936  } else {
937  KMP_DEBUG_ASSERT(resumed_task !=
938  NULL); // verify that resumed task is passed as argument
939  }
940 
941  // Free this task and then ancestor tasks if they have no children.
942  // Restore th_current_task first as suggested by John:
943  // johnmc: if an asynchronous inquiry peers into the runtime system
944  // it doesn't see the freed task as the current task.
945  thread->th.th_current_task = resumed_task;
946  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
947 
948  // TODO: GEH - make sure root team implicit task is initialized properly.
949  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
950  resumed_task->td_flags.executing = 1; // resume previous task
951 
952  KA_TRACE(
953  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
954  gtid, taskdata, resumed_task));
955 
956  return;
957 }
958 
959 template <bool ompt>
960 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
961  kmp_int32 gtid,
962  kmp_task_t *task) {
963  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
964  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
965  // this routine will provide task to resume
966  __kmp_task_finish<ompt>(gtid, task, NULL);
967 
968  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
969  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
970 
971 #if OMPT_SUPPORT
972  if (ompt) {
973  ompt_frame_t *ompt_frame;
974  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
975  ompt_frame->enter_frame = ompt_data_none;
976  ompt_frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
977  }
978 #endif
979 
980  return;
981 }
982 
983 #if OMPT_SUPPORT
984 OMPT_NOINLINE
985 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
986  kmp_task_t *task) {
987  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
988 }
989 #endif // OMPT_SUPPORT
990 
991 // __kmpc_omp_task_complete_if0: report that a task has completed execution
992 //
993 // loc_ref: source location information; points to end of task block.
994 // gtid: global thread number.
995 // task: task thunk for the completed task.
996 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
997  kmp_task_t *task) {
998 #if OMPT_SUPPORT
999  if (UNLIKELY(ompt_enabled.enabled)) {
1000  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1001  return;
1002  }
1003 #endif
1004  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1005 }
1006 
1007 #ifdef TASK_UNUSED
1008 // __kmpc_omp_task_complete: report that a task has completed execution
1009 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1010 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1011  kmp_task_t *task) {
1012  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1013  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1014 
1015  __kmp_task_finish<false>(gtid, task,
1016  NULL); // Not sure how to find task to resume
1017 
1018  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1019  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1020  return;
1021 }
1022 #endif // TASK_UNUSED
1023 
1024 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1025 // task for a given thread
1026 //
1027 // loc_ref: reference to source location of parallel region
1028 // this_thr: thread data structure corresponding to implicit task
1029 // team: team for this_thr
1030 // tid: thread id of given thread within team
1031 // set_curr_task: TRUE if need to push current task to thread
1032 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1033 // have already been done elsewhere.
1034 // TODO: Get better loc_ref. Value passed in may be NULL
1035 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1036  kmp_team_t *team, int tid, int set_curr_task) {
1037  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1038 
1039  KF_TRACE(
1040  10,
1041  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1042  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1043 
1044  task->td_task_id = KMP_GEN_TASK_ID();
1045  task->td_team = team;
1046  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1047  // in debugger)
1048  task->td_ident = loc_ref;
1049  task->td_taskwait_ident = NULL;
1050  task->td_taskwait_counter = 0;
1051  task->td_taskwait_thread = 0;
1052 
1053  task->td_flags.tiedness = TASK_TIED;
1054  task->td_flags.tasktype = TASK_IMPLICIT;
1055 #if OMP_45_ENABLED
1056  task->td_flags.proxy = TASK_FULL;
1057 #endif
1058 
1059  // All implicit tasks are executed immediately, not deferred
1060  task->td_flags.task_serial = 1;
1061  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1062  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1063 
1064  task->td_flags.started = 1;
1065  task->td_flags.executing = 1;
1066  task->td_flags.complete = 0;
1067  task->td_flags.freed = 0;
1068 
1069 #if OMP_40_ENABLED
1070  task->td_depnode = NULL;
1071 #endif
1072  task->td_last_tied = task;
1073 
1074  if (set_curr_task) { // only do this init first time thread is created
1075  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1076  // Not used: don't need to deallocate implicit task
1077  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1078 #if OMP_40_ENABLED
1079  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1080  task->td_dephash = NULL;
1081 #endif
1082  __kmp_push_current_task_to_thread(this_thr, team, tid);
1083  } else {
1084  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1085  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1086  }
1087 
1088 #if OMPT_SUPPORT
1089  if (UNLIKELY(ompt_enabled.enabled))
1090  __ompt_task_init(task, tid);
1091 #endif
1092 
1093  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1094  team, task));
1095 }
1096 
1097 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1098 // at the end of parallel regions. Some resources are kept for reuse in the next
1099 // parallel region.
1100 //
1101 // thread: thread data structure corresponding to implicit task
1102 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1103  kmp_taskdata_t *task = thread->th.th_current_task;
1104  if (task->td_dephash) {
1105  int children;
1106  task->td_flags.complete = 1;
1107  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1108  kmp_tasking_flags_t flags_old = task->td_flags;
1109  if (children == 0 && flags_old.complete == 1) {
1110  kmp_tasking_flags_t flags_new = flags_old;
1111  flags_new.complete = 0;
1112  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1113  *RCAST(kmp_int32 *, &flags_old),
1114  *RCAST(kmp_int32 *, &flags_new))) {
1115  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1116  "dephash of implicit task %p\n",
1117  thread->th.th_info.ds.ds_gtid, task));
1118  __kmp_dephash_free_entries(thread, task->td_dephash);
1119  }
1120  }
1121  }
1122 }
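// The compare-and-swap on the whole flags word above mirrors the one in
// __kmp_free_task_and_ancestors; the intent appears to be that when a
// completing child task races with the implicit task's own cleanup, only one
// of the two paths frees the dephash entries.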
1123 
1124 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1125 // when the corresponding regions are destroyed
1126 //
1127 // thread: thread data structure corresponding to implicit task
1128 void __kmp_free_implicit_task(kmp_info_t *thread) {
1129  kmp_taskdata_t *task = thread->th.th_current_task;
1130  if (task && task->td_dephash) {
1131  __kmp_dephash_free(thread, task->td_dephash);
1132  task->td_dephash = NULL;
1133  }
1134 }
1135 
1136 // Round up a size to a multiple of val (a power of two): Used to insert padding
1137 // between structures co-allocated using a single malloc() call
1138 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1139  if (size & (val - 1)) {
1140  size &= ~(val - 1);
1141  if (size <= KMP_SIZE_T_MAX - val) {
1142  size += val; // Round up if there is no overflow.
1143  }
1144  }
1145  return size;
1146 } // __kmp_round_up_to_val
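// Worked examples, assuming val is a power of two as required:
// __kmp_round_up_to_val(20, 8) == 24 and __kmp_round_up_to_val(24, 8) == 24.
// This is what pads shareds_offset up to a multiple of sizeof(void *) below.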
1147 
1148 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1149 //
1150 // loc_ref: source location information
1151 // gtid: global thread number.
1152 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1153 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1154 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1155 // private vars accessed in task.
1156 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1157 // in task.
1158 // task_entry: Pointer to task code entry point generated by compiler.
1159 // returns: a pointer to the allocated kmp_task_t structure (task).
1160 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1161  kmp_tasking_flags_t *flags,
1162  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1163  kmp_routine_entry_t task_entry) {
1164  kmp_task_t *task;
1165  kmp_taskdata_t *taskdata;
1166  kmp_info_t *thread = __kmp_threads[gtid];
1167  kmp_team_t *team = thread->th.th_team;
1168  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1169  size_t shareds_offset;
1170 
1171  if (!TCR_4(__kmp_init_middle))
1172  __kmp_middle_initialize();
1173 
1174  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1175  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1176  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1177  sizeof_shareds, task_entry));
1178 
1179  if (parent_task->td_flags.final) {
1180  if (flags->merged_if0) {
1181  }
1182  flags->final = 1;
1183  }
1184  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1185  // Untied task encountered causes the TSC algorithm to check entire deque of
1186  // the victim thread. If no untied task encountered, then checking the head
1187  // of the deque should be enough.
1188  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1189  }
1190 
1191 #if OMP_45_ENABLED
1192  if (flags->proxy == TASK_PROXY) {
1193  flags->tiedness = TASK_UNTIED;
1194  flags->merged_if0 = 1;
1195 
1196  /* are we running in a sequential parallel or tskm_immediate_exec... we need
1197  tasking support enabled */
1198  if ((thread->th.th_task_team) == NULL) {
1199  /* This should only happen if the team is serialized
1200  setup a task team and propagate it to the thread */
1201  KMP_DEBUG_ASSERT(team->t.t_serialized);
1202  KA_TRACE(30,
1203  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1204  gtid));
1205  __kmp_task_team_setup(
1206  thread, team,
1207  1); // 1 indicates setup the current team regardless of nthreads
1208  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1209  }
1210  kmp_task_team_t *task_team = thread->th.th_task_team;
1211 
1212  /* tasking must be enabled now as the task might not be pushed */
1213  if (!KMP_TASKING_ENABLED(task_team)) {
1214  KA_TRACE(
1215  30,
1216  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1217  __kmp_enable_tasking(task_team, thread);
1218  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1219  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1220  // No lock needed since only owner can allocate
1221  if (thread_data->td.td_deque == NULL) {
1222  __kmp_alloc_task_deque(thread, thread_data);
1223  }
1224  }
1225 
1226  if (task_team->tt.tt_found_proxy_tasks == FALSE)
1227  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1228  }
1229 #endif
1230 
1231  // Calculate shared structure offset including padding after kmp_task_t struct
1232  // to align pointers in shared struct
1233  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1234  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1235 
1236  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1237  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1238  shareds_offset));
1239  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1240  sizeof_shareds));
1241 
1242 // Avoid double allocation here by combining shareds with taskdata
1243 #if USE_FAST_MEMORY
1244  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1245  sizeof_shareds);
1246 #else /* ! USE_FAST_MEMORY */
1247  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1248  sizeof_shareds);
1249 #endif /* USE_FAST_MEMORY */
1250  ANNOTATE_HAPPENS_AFTER(taskdata);
1251 
1252  task = KMP_TASKDATA_TO_TASK(taskdata);
1253 
1254 // Make sure task & taskdata are aligned appropriately
1255 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1256  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1257  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1258 #else
1259  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1260  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1261 #endif
1262  if (sizeof_shareds > 0) {
1263  // Avoid double allocation here by combining shareds with taskdata
1264  task->shareds = &((char *)taskdata)[shareds_offset];
1265  // Make sure shareds struct is aligned to pointer size
1266  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1267  0);
1268  } else {
1269  task->shareds = NULL;
1270  }
1271  task->routine = task_entry;
1272  task->part_id = 0; // AC: Always start with 0 part id
1273 
1274  taskdata->td_task_id = KMP_GEN_TASK_ID();
1275  taskdata->td_team = team;
1276  taskdata->td_alloc_thread = thread;
1277  taskdata->td_parent = parent_task;
1278  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1279  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1280  taskdata->td_ident = loc_ref;
1281  taskdata->td_taskwait_ident = NULL;
1282  taskdata->td_taskwait_counter = 0;
1283  taskdata->td_taskwait_thread = 0;
1284  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1285 #if OMP_45_ENABLED
1286  // avoid copying icvs for proxy tasks
1287  if (flags->proxy == TASK_FULL)
1288 #endif
1289  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1290 
1291  taskdata->td_flags.tiedness = flags->tiedness;
1292  taskdata->td_flags.final = flags->final;
1293  taskdata->td_flags.merged_if0 = flags->merged_if0;
1294 #if OMP_40_ENABLED
1295  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1296 #endif // OMP_40_ENABLED
1297 #if OMP_45_ENABLED
1298  taskdata->td_flags.proxy = flags->proxy;
1299  taskdata->td_task_team = thread->th.th_task_team;
1300  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1301 #endif
1302  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1303 
1304  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1305  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1306 
1307  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1308  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1309 
1310  // GEH - Note we serialize the task if the team is serialized to make sure
1311  // implicit parallel region tasks are not left until program termination to
1312  // execute. Also, it helps locality to execute immediately.
1313 
1314  taskdata->td_flags.task_serial =
1315  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1316  taskdata->td_flags.tasking_ser);
1317 
1318  taskdata->td_flags.started = 0;
1319  taskdata->td_flags.executing = 0;
1320  taskdata->td_flags.complete = 0;
1321  taskdata->td_flags.freed = 0;
1322 
1323  taskdata->td_flags.native = flags->native;
1324 
1325  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1326  // start at one because counts current task and children
1327  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1328 #if OMP_40_ENABLED
1329  taskdata->td_taskgroup =
1330  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1331  taskdata->td_dephash = NULL;
1332  taskdata->td_depnode = NULL;
1333 #endif
1334  if (flags->tiedness == TASK_UNTIED)
1335  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1336  else
1337  taskdata->td_last_tied = taskdata;
1338 
1339 #if OMPT_SUPPORT
1340  if (UNLIKELY(ompt_enabled.enabled))
1341  __ompt_task_init(taskdata, gtid);
1342 #endif
1343 // Only need to keep track of child task counts if team parallel and tasking not
1344 // serialized or if it is a proxy task
1345 #if OMP_45_ENABLED
1346  if (flags->proxy == TASK_PROXY ||
1347  !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1348 #else
1349  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1350 #endif
1351  {
1352  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1353 #if OMP_40_ENABLED
1354  if (parent_task->td_taskgroup)
1355  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1356 #endif
1357  // Only need to keep track of allocated child tasks for explicit tasks since
1358  // implicit not deallocated
1359  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1360  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1361  }
1362  }
1363 
1364  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1365  gtid, taskdata, taskdata->td_parent));
1366  ANNOTATE_HAPPENS_BEFORE(task);
1367 
1368  return task;
1369 }
1370 
1371 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1372  kmp_int32 flags, size_t sizeof_kmp_task_t,
1373  size_t sizeof_shareds,
1374  kmp_routine_entry_t task_entry) {
1375  kmp_task_t *retval;
1376  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1377 
1378  input_flags->native = FALSE;
1379 // __kmp_task_alloc() sets up all other runtime flags
1380 
1381 #if OMP_45_ENABLED
1382  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1383  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1384  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1385  input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
1386  sizeof_shareds, task_entry));
1387 #else
1388  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1389  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1390  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1391  sizeof_kmp_task_t, sizeof_shareds, task_entry));
1392 #endif
1393 
1394  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1395  sizeof_shareds, task_entry);
1396 
1397  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1398 
1399  return retval;
1400 }
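// Illustrative sketch (not part of the runtime) of a typical call into the
// allocator above; the outlined routine name and size expressions are assumed.
// Bit 0 of the flags word corresponds to kmp_tasking_flags_t::tiedness
// (1 = tied, 0 = untied).
#if 0
  kmp_task_t *new_task = __kmpc_omp_task_alloc(
      &loc, gtid, /* flags = */ 1 /* tied */,
      sizeof(kmp_task_t) + sizeof_private_vars, /* task struct plus privates */
      n_shared_vars * sizeof(void *),           /* pointers to shared vars */
      &outlined_task_entry);
#endif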
1401 
1402 #if OMP_50_ENABLED
1403 
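// __kmpc_omp_reg_task_with_affinity: entry point for registering the task
// affinity hints in affin_list (naffins entries) with the task new_task;
// this version ignores the hints and simply returns 0.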
1416 kmp_int32
1417 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1418  kmp_task_t *new_task, kmp_int32 naffins,
1419  kmp_task_affinity_info_t *affin_list) {
1420  return 0;
1421 }
1422 #endif
1423 
1424 // __kmp_invoke_task: invoke the specified task
1425 //
1426 // gtid: global thread ID of caller
1427 // task: the task to invoke
1428 // current_task: the task to resume after task invocation
1429 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1430  kmp_taskdata_t *current_task) {
1431  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1432  kmp_info_t *thread;
1433 #if OMP_40_ENABLED
1434  int discard = 0 /* false */;
1435 #endif
1436  KA_TRACE(
1437  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1438  gtid, taskdata, current_task));
1439  KMP_DEBUG_ASSERT(task);
1440 #if OMP_45_ENABLED
1441  if (taskdata->td_flags.proxy == TASK_PROXY &&
1442  taskdata->td_flags.complete == 1) {
1443  // This is a proxy task that was already completed but it needs to run
1444  // its bottom-half finish
1445  KA_TRACE(
1446  30,
1447  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1448  gtid, taskdata));
1449 
1450  __kmp_bottom_half_finish_proxy(gtid, task);
1451 
1452  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1453  "proxy task %p, resuming task %p\n",
1454  gtid, taskdata, current_task));
1455 
1456  return;
1457  }
1458 #endif
1459 
1460 #if OMPT_SUPPORT
1461  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1462  // does not execute code.
1463  ompt_thread_info_t oldInfo;
1464  if (UNLIKELY(ompt_enabled.enabled)) {
1465  // Store the thread's state and restore it after the task
1466  thread = __kmp_threads[gtid];
1467  oldInfo = thread->th.ompt_thread_info;
1468  thread->th.ompt_thread_info.wait_id = 0;
1469  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1470  ? ompt_state_work_serial
1471  : ompt_state_work_parallel;
1472  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1473  }
1474 #endif
1475 
1476 #if OMP_45_ENABLED
1477  // Proxy tasks are not handled by the runtime
1478  if (taskdata->td_flags.proxy != TASK_PROXY) {
1479 #endif
1480  ANNOTATE_HAPPENS_AFTER(task);
1481  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1482 #if OMP_45_ENABLED
1483  }
1484 #endif
1485 
1486 #if OMP_40_ENABLED
1487  // TODO: cancel tasks if the parallel region has also been cancelled
1488  // TODO: check if this sequence can be hoisted above __kmp_task_start
1489  // if cancellation has been enabled for this run ...
1490  if (__kmp_omp_cancellation) {
1491  thread = __kmp_threads[gtid];
1492  kmp_team_t *this_team = thread->th.th_team;
1493  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1494  if ((taskgroup && taskgroup->cancel_request) ||
1495  (this_team->t.t_cancel_request == cancel_parallel)) {
1496 #if OMPT_SUPPORT && OMPT_OPTIONAL
1497  ompt_data_t *task_data;
1498  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1499  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1500  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1501  task_data,
1502  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1503  : ompt_cancel_parallel) |
1504  ompt_cancel_discarded_task,
1505  NULL);
1506  }
1507 #endif
1508  KMP_COUNT_BLOCK(TASK_cancelled);
1509  // the task is in a cancelled taskgroup or parallel region, so discard it
1510  discard = 1 /* true */;
1511  }
1512  }
1513 
1514  // Invoke the task routine and pass in relevant data.
1515  // Thunks generated by gcc take a different argument list.
1516  if (!discard) {
1517  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1518  taskdata->td_last_tied = current_task->td_last_tied;
1519  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1520  }
1521 #if KMP_STATS_ENABLED
1522  KMP_COUNT_BLOCK(TASK_executed);
1523  switch (KMP_GET_THREAD_STATE()) {
1524  case FORK_JOIN_BARRIER:
1525  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1526  break;
1527  case PLAIN_BARRIER:
1528  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1529  break;
1530  case TASKYIELD:
1531  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1532  break;
1533  case TASKWAIT:
1534  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1535  break;
1536  case TASKGROUP:
1537  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1538  break;
1539  default:
1540  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1541  break;
1542  }
1543 #endif // KMP_STATS_ENABLED
1544 #endif // OMP_40_ENABLED
1545 
1546 // OMPT task begin
1547 #if OMPT_SUPPORT
1548  if (UNLIKELY(ompt_enabled.enabled))
1549  __ompt_task_start(task, current_task, gtid);
1550 #endif
1551 
1552 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1553  kmp_uint64 cur_time;
1554  kmp_int32 kmp_itt_count_task =
1555  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1556  current_task->td_flags.tasktype == TASK_IMPLICIT;
1557  if (kmp_itt_count_task) {
1558  thread = __kmp_threads[gtid];
1559  // Time outer level explicit task on barrier for adjusting imbalance time
1560  if (thread->th.th_bar_arrive_time)
1561  cur_time = __itt_get_timestamp();
1562  else
1563  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1564  }
1565 #endif
1566 
1567 #ifdef KMP_GOMP_COMPAT
1568  if (taskdata->td_flags.native) {
1569  ((void (*)(void *))(*(task->routine)))(task->shareds);
1570  } else
1571 #endif /* KMP_GOMP_COMPAT */
1572  {
1573  (*(task->routine))(gtid, task);
1574  }
1575  KMP_POP_PARTITIONED_TIMER();
1576 
1577 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1578  if (kmp_itt_count_task) {
1579  // Barrier imbalance - adjust arrive time with the task duration
1580  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1581  }
1582 #endif
1583 
1584 #if OMP_40_ENABLED
1585  }
1586 #endif // OMP_40_ENABLED
1587 
1588 
1589 #if OMP_45_ENABLED
1590  // Proxy tasks are not handled by the runtime
1591  if (taskdata->td_flags.proxy != TASK_PROXY) {
1592 #endif
1593  ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1594 #if OMPT_SUPPORT
1595  if (UNLIKELY(ompt_enabled.enabled)) {
1596  thread->th.ompt_thread_info = oldInfo;
1597  if (taskdata->td_flags.tiedness == TASK_TIED) {
1598  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1599  }
1600  __kmp_task_finish<true>(gtid, task, current_task);
1601  } else
1602 #endif
1603  __kmp_task_finish<false>(gtid, task, current_task);
1604 #if OMP_45_ENABLED
1605  }
1606 #endif
1607 
1608  KA_TRACE(
1609  30,
1610  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1611  gtid, taskdata, current_task));
1612  return;
1613 }
1614 
1615 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1616 //
1617 // loc_ref: location of original task pragma (ignored)
1618 // gtid: Global Thread ID of encountering thread
1619 // new_task: task thunk allocated by __kmpc_omp_task_alloc() for the "new task"
1620 // Returns:
1621 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1622 // queued to be resumed later.
1623 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1624 // be resumed later.
1625 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1626  kmp_task_t *new_task) {
1627  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1628 
1629  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1630  loc_ref, new_taskdata));
1631 
1632 #if OMPT_SUPPORT
1633  kmp_taskdata_t *parent;
1634  if (UNLIKELY(ompt_enabled.enabled)) {
1635  parent = new_taskdata->td_parent;
1636  if (ompt_enabled.ompt_callback_task_create) {
1637  ompt_data_t task_data = ompt_data_none;
1638  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1639  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1640  parent ? &(parent->ompt_task_info.frame) : NULL,
1641  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1642  OMPT_GET_RETURN_ADDRESS(0));
1643  }
1644  }
1645 #endif
1646 
1647  /* Should we execute the new task or queue it? For now, let's just always try
1648  to queue it. If the queue fills up, then we'll execute it. */
1649 
1650  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1651  { // Execute this task immediately
1652  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1653  new_taskdata->td_flags.task_serial = 1;
1654  __kmp_invoke_task(gtid, new_task, current_task);
1655  }
1656 
1657  KA_TRACE(
1658  10,
1659  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1660  "loc=%p task=%p\n",
1661  gtid, loc_ref, new_taskdata));
1662 
1663  ANNOTATE_HAPPENS_BEFORE(new_task);
1664 #if OMPT_SUPPORT
1665  if (UNLIKELY(ompt_enabled.enabled)) {
1666  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1667  }
1668 #endif
1669  return TASK_CURRENT_NOT_QUEUED;
1670 }
1671 
1672 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1673 //
1674 // gtid: Global Thread ID of encountering thread
1675 // new_task: non-thread-switchable task thunk allocated by __kmpc_omp_task_alloc()
1676 // serialize_immediate: if TRUE then if the task is executed immediately its
1677 // execution will be serialized
1678 // Returns:
1679 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1680 // queued to be resumed later.
1681 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1682 // be resumed later.
1683 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1684  bool serialize_immediate) {
1685  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1686 
1687 /* Should we execute the new task or queue it? For now, let's just always try to
1688  queue it. If the queue fills up, then we'll execute it. */
1689 #if OMP_45_ENABLED
1690  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1691  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1692 #else
1693  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1694 #endif
1695  { // Execute this task immediately
1696  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1697  if (serialize_immediate)
1698  new_taskdata->td_flags.task_serial = 1;
1699  __kmp_invoke_task(gtid, new_task, current_task);
1700  }
1701 
1702  ANNOTATE_HAPPENS_BEFORE(new_task);
1703  return TASK_CURRENT_NOT_QUEUED;
1704 }
1705 
1706 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1707 // non-thread-switchable task from the parent thread only!
1708 //
1709 // loc_ref: location of original task pragma (ignored)
1710 // gtid: Global Thread ID of encountering thread
1711 // new_task: non-thread-switchable task thunk allocated by
1712 // __kmpc_omp_task_alloc()
1713 // Returns:
1714 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1715 // queued to be resumed later.
1716 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1717 // be resumed later.
1718 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1719  kmp_task_t *new_task) {
1720  kmp_int32 res;
1721  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1722 
1723 #if KMP_DEBUG || OMPT_SUPPORT
1724  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1725 #endif
1726  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1727  new_taskdata));
1728 
1729 #if OMPT_SUPPORT
1730  kmp_taskdata_t *parent = NULL;
1731  if (UNLIKELY(ompt_enabled.enabled)) {
1732  if (!new_taskdata->td_flags.started) {
1733  OMPT_STORE_RETURN_ADDRESS(gtid);
1734  parent = new_taskdata->td_parent;
1735  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1736  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1737  }
1738  if (ompt_enabled.ompt_callback_task_create) {
1739  ompt_data_t task_data = ompt_data_none;
1740  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1741  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1742  parent ? &(parent->ompt_task_info.frame) : NULL,
1743  &(new_taskdata->ompt_task_info.task_data),
1744  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1745  OMPT_LOAD_RETURN_ADDRESS(gtid));
1746  }
1747  } else {
1748  // We are scheduling the continuation of an UNTIED task.
1749  // Scheduling back to the parent task.
1750  __ompt_task_finish(new_task,
1751  new_taskdata->ompt_task_info.scheduling_parent,
1752  ompt_task_switch);
1753  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1754  }
1755  }
1756 #endif
1757 
1758  res = __kmp_omp_task(gtid, new_task, true);
1759 
1760  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1761  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1762  gtid, loc_ref, new_taskdata));
1763 #if OMPT_SUPPORT
1764  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1765  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1766  }
1767 #endif
1768  return res;
1769 }
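
// Illustrative sketch (not part of the runtime): a compiler typically lowers
//   #pragma omp task
// into an alloc/submit pair built on the two entry points above. The names
// task_entry and work() below are hypothetical, and the exact flag encoding
// and struct sizes are chosen by the compiler:
//
//   kmp_int32 task_entry(kmp_int32 gtid, void *t) {
//     kmp_task_t *task = (kmp_task_t *)t;
//     work(); // outlined task body; shared variables are reached via task->shareds
//     return 0;
//   }
//   ...
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /* flags: tied */ 1,
//                                         sizeof(kmp_task_t),
//                                         /* sizeof_shareds */ 0, &task_entry);
//   __kmpc_omp_task(&loc, gtid, t); // deferred if possible, else run immediately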
1770 
1771 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1772 // a taskloop task with the correct OMPT return address
1773 //
1774 // loc_ref: location of original task pragma (ignored)
1775 // gtid: Global Thread ID of encountering thread
1776 // new_task: non-thread-switchable task thunk allocated by
1777 // __kmpc_omp_task_alloc()
1778 // codeptr_ra: return address for OMPT callback
1779 // Returns:
1780 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1781 // queued to be resumed later.
1782 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1783 // be resumed later.
1784 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1785  kmp_task_t *new_task, void *codeptr_ra) {
1786  kmp_int32 res;
1787  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1788 
1789 #if KMP_DEBUG || OMPT_SUPPORT
1790  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1791 #endif
1792  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n",
1793  gtid, loc_ref, new_taskdata));
1794 
1795 #if OMPT_SUPPORT
1796  kmp_taskdata_t *parent = NULL;
1797  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1798  parent = new_taskdata->td_parent;
1799  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1800  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1801  if (ompt_enabled.ompt_callback_task_create) {
1802  ompt_data_t task_data = ompt_data_none;
1803  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1804  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1805  parent ? &(parent->ompt_task_info.frame) : NULL,
1806  &(new_taskdata->ompt_task_info.task_data),
1807  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1808  codeptr_ra);
1809  }
1810  }
1811 #endif
1812 
1813  res = __kmp_omp_task(gtid, new_task, true);
1814 
1815  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1816  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1817  gtid, loc_ref, new_taskdata));
1818 #if OMPT_SUPPORT
1819  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1820  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1821  }
1822 #endif
1823  return res;
1824 }
1825 
1826 template <bool ompt>
1827 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1828  void *frame_address,
1829  void *return_address) {
1830  kmp_taskdata_t *taskdata;
1831  kmp_info_t *thread;
1832  int thread_finished = FALSE;
1833  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1834 
1835  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1836 
1837  if (__kmp_tasking_mode != tskm_immediate_exec) {
1838  thread = __kmp_threads[gtid];
1839  taskdata = thread->th.th_current_task;
1840 
1841 #if OMPT_SUPPORT && OMPT_OPTIONAL
1842  ompt_data_t *my_task_data;
1843  ompt_data_t *my_parallel_data;
1844 
1845  if (ompt) {
1846  my_task_data = &(taskdata->ompt_task_info.task_data);
1847  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1848 
1849  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1850 
1851  if (ompt_enabled.ompt_callback_sync_region) {
1852  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1853  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1854  my_task_data, return_address);
1855  }
1856 
1857  if (ompt_enabled.ompt_callback_sync_region_wait) {
1858  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1859  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1860  my_task_data, return_address);
1861  }
1862  }
1863 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1864 
1865 // Debugger: The taskwait is active. Store the location and the thread that
1866 // encountered the taskwait.
1867 #if USE_ITT_BUILD
1868 // Note: These values are used by ITT events as well.
1869 #endif /* USE_ITT_BUILD */
1870  taskdata->td_taskwait_counter += 1;
1871  taskdata->td_taskwait_ident = loc_ref;
1872  taskdata->td_taskwait_thread = gtid + 1;
1873 
1874 #if USE_ITT_BUILD
1875  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1876  if (itt_sync_obj != NULL)
1877  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1878 #endif /* USE_ITT_BUILD */
1879 
1880  bool must_wait =
1881  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1882 
1883 #if OMP_45_ENABLED
1884  must_wait = must_wait || (thread->th.th_task_team != NULL &&
1885  thread->th.th_task_team->tt.tt_found_proxy_tasks);
1886 #endif
1887  if (must_wait) {
1888  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
1889  &(taskdata->td_incomplete_child_tasks)),
1890  0U);
1891  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1892  flag.execute_tasks(thread, gtid, FALSE,
1893  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1894  __kmp_task_stealing_constraint);
1895  }
1896  }
1897 #if USE_ITT_BUILD
1898  if (itt_sync_obj != NULL)
1899  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1900 #endif /* USE_ITT_BUILD */
1901 
1902  // Debugger: The taskwait is completed. Location remains, but thread is
1903  // negated.
1904  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1905 
1906 #if OMPT_SUPPORT && OMPT_OPTIONAL
1907  if (ompt) {
1908  if (ompt_enabled.ompt_callback_sync_region_wait) {
1909  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1910  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1911  my_task_data, return_address);
1912  }
1913  if (ompt_enabled.ompt_callback_sync_region) {
1914  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1915  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1916  my_task_data, return_address);
1917  }
1918  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1919  }
1920 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1921 
1922  ANNOTATE_HAPPENS_AFTER(taskdata);
1923  }
1924 
1925  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1926  "returning TASK_CURRENT_NOT_QUEUED\n",
1927  gtid, taskdata));
1928 
1929  return TASK_CURRENT_NOT_QUEUED;
1930 }
1931 
1932 #if OMPT_SUPPORT && OMPT_OPTIONAL
1933 OMPT_NOINLINE
1934 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1935  void *frame_address,
1936  void *return_address) {
1937  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1938  return_address);
1939 }
1940 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1941 
1942 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1943 // complete
1944 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1945 #if OMPT_SUPPORT && OMPT_OPTIONAL
1946  if (UNLIKELY(ompt_enabled.enabled)) {
1947  OMPT_STORE_RETURN_ADDRESS(gtid);
1948  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
1949  OMPT_LOAD_RETURN_ADDRESS(gtid));
1950  }
1951 #endif
1952  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
1953 }
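
// Illustrative sketch (not part of the runtime): `#pragma omp taskwait` in user
// code is lowered to a single call to the entry point above, roughly
//
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
//
// The caller executes other queued tasks while it waits, returning once the
// current task's td_incomplete_child_tasks count drops to zero.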
1954 
1955 // __kmpc_omp_taskyield: switch to a different task
1956 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1957  kmp_taskdata_t *taskdata;
1958  kmp_info_t *thread;
1959  int thread_finished = FALSE;
1960 
1961  KMP_COUNT_BLOCK(OMP_TASKYIELD);
1962  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1963 
1964  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1965  gtid, loc_ref, end_part));
1966 
1967  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1968  thread = __kmp_threads[gtid];
1969  taskdata = thread->th.th_current_task;
1970 // Should we model this as a task wait or not?
1971 // Debugger: The taskwait is active. Store the location and the thread that
1972 // encountered the taskwait.
1973 #if USE_ITT_BUILD
1974 // Note: These values are used by ITT events as well.
1975 #endif /* USE_ITT_BUILD */
1976  taskdata->td_taskwait_counter += 1;
1977  taskdata->td_taskwait_ident = loc_ref;
1978  taskdata->td_taskwait_thread = gtid + 1;
1979 
1980 #if USE_ITT_BUILD
1981  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1982  if (itt_sync_obj != NULL)
1983  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1984 #endif /* USE_ITT_BUILD */
1985  if (!taskdata->td_flags.team_serial) {
1986  kmp_task_team_t *task_team = thread->th.th_task_team;
1987  if (task_team != NULL) {
1988  if (KMP_TASKING_ENABLED(task_team)) {
1989 #if OMPT_SUPPORT
1990  if (UNLIKELY(ompt_enabled.enabled))
1991  thread->th.ompt_thread_info.ompt_task_yielded = 1;
1992 #endif
1993  __kmp_execute_tasks_32(
1994  thread, gtid, NULL, FALSE,
1995  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1996  __kmp_task_stealing_constraint);
1997 #if OMPT_SUPPORT
1998  if (UNLIKELY(ompt_enabled.enabled))
1999  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2000 #endif
2001  }
2002  }
2003  }
2004 #if USE_ITT_BUILD
2005  if (itt_sync_obj != NULL)
2006  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2007 #endif /* USE_ITT_BUILD */
2008 
2009  // Debugger: The taskwait is completed. Location remains, but thread is
2010  // negated.
2011  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2012  }
2013 
2014  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2015  "returning TASK_CURRENT_NOT_QUEUED\n",
2016  gtid, taskdata));
2017 
2018  return TASK_CURRENT_NOT_QUEUED;
2019 }
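
// Illustrative sketch (not part of the runtime): `#pragma omp taskyield` is
// lowered to a call along the lines of
//
//   __kmpc_omp_taskyield(&loc, gtid, /* end_part */ 0);
//
// giving the runtime a chance to run other queued tasks on this thread before
// the current task resumes.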
2020 
2021 #if OMP_50_ENABLED
2022 // Task Reduction implementation
2023 
2024 typedef struct kmp_task_red_flags {
2025  unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
2026  unsigned reserved31 : 31;
2027 } kmp_task_red_flags_t;
2028 
2029 // internal structure for reduction data item related info
2030 typedef struct kmp_task_red_data {
2031  void *reduce_shar; // shared reduction item
2032  size_t reduce_size; // size of data item
2033  void *reduce_priv; // thread specific data
2034  void *reduce_pend; // end of private data for comparison op
2035  void *reduce_init; // data initialization routine
2036  void *reduce_fini; // data finalization routine
2037  void *reduce_comb; // data combiner routine
2038  kmp_task_red_flags_t flags; // flags for additional info from compiler
2039 } kmp_task_red_data_t;
2040 
2041 // structure sent to us by the compiler - one per reduction item
2042 typedef struct kmp_task_red_input {
2043  void *reduce_shar; // shared reduction item
2044  size_t reduce_size; // size of data item
2045  void *reduce_init; // data initialization routine
2046  void *reduce_fini; // data finalization routine
2047  void *reduce_comb; // data combiner routine
2048  kmp_task_red_flags_t flags; // flags for additional info from compiler
2049 } kmp_task_red_input_t;
2050 
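// __kmpc_task_reduction_init: initialize task reduction for the taskgroup of
// the calling thread.
//
// gtid: Global Thread ID of calling thread
// num: number of reduction items in the data array
// data: array of kmp_task_red_input_t describing the reduction items
// Returns the taskgroup pointer, used as a handle in subsequent calls to
// __kmpc_task_reduction_get_th_data().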
2060 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2061  kmp_info_t *thread = __kmp_threads[gtid];
2062  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2063  kmp_int32 nth = thread->th.th_team_nproc;
2064  kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
2065  kmp_task_red_data_t *arr;
2066 
2067  // check input data just in case
2068  KMP_ASSERT(tg != NULL);
2069  KMP_ASSERT(data != NULL);
2070  KMP_ASSERT(num > 0);
2071  if (nth == 1) {
2072  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2073  gtid, tg));
2074  return (void *)tg;
2075  }
2076  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2077  gtid, tg, num));
2078  arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
2079  thread, num * sizeof(kmp_task_red_data_t));
2080  for (int i = 0; i < num; ++i) {
2081  void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
2082  size_t size = input[i].reduce_size - 1;
2083  // round the size up to cache line per thread-specific item
2084  size += CACHE_LINE - size % CACHE_LINE;
2085  KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
2086  arr[i].reduce_shar = input[i].reduce_shar;
2087  arr[i].reduce_size = size;
2088  arr[i].reduce_init = input[i].reduce_init;
2089  arr[i].reduce_fini = input[i].reduce_fini;
2090  arr[i].reduce_comb = input[i].reduce_comb;
2091  arr[i].flags = input[i].flags;
2092  if (!input[i].flags.lazy_priv) {
2093  // allocate cache-line aligned block and fill it with zeros
2094  arr[i].reduce_priv = __kmp_allocate(nth * size);
2095  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2096  if (f_init != NULL) {
2097  // initialize thread-specific items
2098  for (int j = 0; j < nth; ++j) {
2099  f_init((char *)(arr[i].reduce_priv) + j * size);
2100  }
2101  }
2102  } else {
2103  // only allocate space for pointers now,
2104  // objects will be lazily allocated/initialized once requested
2105  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2106  }
2107  }
2108  tg->reduce_data = (void *)arr;
2109  tg->reduce_num_data = num;
2110  return (void *)tg;
2111 }
2112 
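// __kmpc_task_reduction_get_th_data: return the calling thread's private copy
// of a reduction item.
//
// gtid: Global Thread ID of calling thread
// tskgrp: handle returned by __kmpc_task_reduction_init(), or NULL to use the
// current taskgroup
// data: address of the shared reduction item, or an address inside some
// thread's private copy of it
// Returns the address of this thread's private copy, allocating and
// initializing it lazily if the item was registered with the lazy_priv flag.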
2122 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2123  kmp_info_t *thread = __kmp_threads[gtid];
2124  kmp_int32 nth = thread->th.th_team_nproc;
2125  if (nth == 1)
2126  return data; // nothing to do
2127 
2128  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2129  if (tg == NULL)
2130  tg = thread->th.th_current_task->td_taskgroup;
2131  KMP_ASSERT(tg != NULL);
2132  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
2133  kmp_int32 num = tg->reduce_num_data;
2134  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2135 
2136  KMP_ASSERT(data != NULL);
2137  while (tg != NULL) {
2138  for (int i = 0; i < num; ++i) {
2139  if (!arr[i].flags.lazy_priv) {
2140  if (data == arr[i].reduce_shar ||
2141  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2142  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2143  } else {
2144  // check shared location first
2145  void **p_priv = (void **)(arr[i].reduce_priv);
2146  if (data == arr[i].reduce_shar)
2147  goto found;
2148  // check if we get some thread specific location as parameter
2149  for (int j = 0; j < nth; ++j)
2150  if (data == p_priv[j])
2151  goto found;
2152  continue; // not found, continue search
2153  found:
2154  if (p_priv[tid] == NULL) {
2155  // allocate thread specific object lazily
2156  void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
2157  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2158  if (f_init != NULL) {
2159  f_init(p_priv[tid]);
2160  }
2161  }
2162  return p_priv[tid];
2163  }
2164  }
2165  tg = tg->parent;
2166  arr = (kmp_task_red_data_t *)(tg->reduce_data);
2167  num = tg->reduce_num_data;
2168  }
2169  KMP_ASSERT2(0, "Unknown task reduction item");
2170  return NULL; // ERROR, this line never executed
2171 }
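
// Illustrative sketch (not part of the runtime): for a hypothetical taskgroup
// reduction over a single int, compiler-generated code would fill one
// kmp_task_red_input_t, register it, and have each task fetch its private
// copy. The names red_init, red_comb, sum and partial are hypothetical:
//
//   static void red_init(void *p) { *(int *)p = 0; }
//   static void red_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
//   ...
//   int sum = 0; // shared reduction item
//   kmp_task_red_input_t in = {&sum, sizeof(int), (void *)red_init,
//                              /* reduce_fini */ NULL, (void *)red_comb, {0}};
//   void *tg = __kmpc_task_reduction_init(gtid, 1, &in); // inside the taskgroup
//   ...
//   // inside each participating task:
//   int *priv = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &sum);
//   *priv += partial;
//   ...
//   // __kmpc_end_taskgroup() then combines the private copies back into sum
//   // via __kmp_task_reduction_fini().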
2172 
2173 // Finalize task reduction.
2174 // Called from __kmpc_end_taskgroup()
2175 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2176  kmp_int32 nth = th->th.th_team_nproc;
2177  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2178  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
2179  kmp_int32 num = tg->reduce_num_data;
2180  for (int i = 0; i < num; ++i) {
2181  void *sh_data = arr[i].reduce_shar;
2182  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2183  void (*f_comb)(void *, void *) =
2184  (void (*)(void *, void *))(arr[i].reduce_comb);
2185  if (!arr[i].flags.lazy_priv) {
2186  void *pr_data = arr[i].reduce_priv;
2187  size_t size = arr[i].reduce_size;
2188  for (int j = 0; j < nth; ++j) {
2189  void *priv_data = (char *)pr_data + j * size;
2190  f_comb(sh_data, priv_data); // combine results
2191  if (f_fini)
2192  f_fini(priv_data); // finalize if needed
2193  }
2194  } else {
2195  void **pr_data = (void **)(arr[i].reduce_priv);
2196  for (int j = 0; j < nth; ++j) {
2197  if (pr_data[j] != NULL) {
2198  f_comb(sh_data, pr_data[j]); // combine results
2199  if (f_fini)
2200  f_fini(pr_data[j]); // finalize if needed
2201  __kmp_free(pr_data[j]);
2202  }
2203  }
2204  }
2205  __kmp_free(arr[i].reduce_priv);
2206  }
2207  __kmp_thread_free(th, arr);
2208  tg->reduce_data = NULL;
2209  tg->reduce_num_data = 0;
2210 }
2211 #endif
2212 
2213 #if OMP_40_ENABLED
2214 // __kmpc_taskgroup: Start a new taskgroup
2215 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2216  kmp_info_t *thread = __kmp_threads[gtid];
2217  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2218  kmp_taskgroup_t *tg_new =
2219  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2220  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2221  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2222  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2223  tg_new->parent = taskdata->td_taskgroup;
2224 #if OMP_50_ENABLED
2225  tg_new->reduce_data = NULL;
2226  tg_new->reduce_num_data = 0;
2227 #endif
2228  taskdata->td_taskgroup = tg_new;
2229 
2230 #if OMPT_SUPPORT && OMPT_OPTIONAL
2231  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2232  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2233  if (!codeptr)
2234  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2235  kmp_team_t *team = thread->th.th_team;
2236  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2237  // FIXME: I think this is wrong for lwt!
2238  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2239 
2240  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2241  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2242  &(my_task_data), codeptr);
2243  }
2244 #endif
2245 }
2246 
2247 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2248 // and its descendants are complete
2249 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2250  kmp_info_t *thread = __kmp_threads[gtid];
2251  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2252  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2253  int thread_finished = FALSE;
2254 
2255 #if OMPT_SUPPORT && OMPT_OPTIONAL
2256  kmp_team_t *team;
2257  ompt_data_t my_task_data;
2258  ompt_data_t my_parallel_data;
2259  void *codeptr;
2260  if (UNLIKELY(ompt_enabled.enabled)) {
2261  team = thread->th.th_team;
2262  my_task_data = taskdata->ompt_task_info.task_data;
2263  // FIXME: I think this is wrong for lwt!
2264  my_parallel_data = team->t.ompt_team_info.parallel_data;
2265  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2266  if (!codeptr)
2267  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2268  }
2269 #endif
2270 
2271  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2272  KMP_DEBUG_ASSERT(taskgroup != NULL);
2273  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2274 
2275  if (__kmp_tasking_mode != tskm_immediate_exec) {
2276  // mark task as waiting not on a barrier
2277  taskdata->td_taskwait_counter += 1;
2278  taskdata->td_taskwait_ident = loc;
2279  taskdata->td_taskwait_thread = gtid + 1;
2280 #if USE_ITT_BUILD
2281  // For ITT the taskgroup wait is similar to taskwait until we need to
2282  // distinguish them
2283  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2284  if (itt_sync_obj != NULL)
2285  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2286 #endif /* USE_ITT_BUILD */
2287 
2288 #if OMPT_SUPPORT && OMPT_OPTIONAL
2289  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2290  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2291  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2292  &(my_task_data), codeptr);
2293  }
2294 #endif
2295 
2296 #if OMP_45_ENABLED
2297  if (!taskdata->td_flags.team_serial ||
2298  (thread->th.th_task_team != NULL &&
2299  thread->th.th_task_team->tt.tt_found_proxy_tasks))
2300 #else
2301  if (!taskdata->td_flags.team_serial)
2302 #endif
2303  {
2304  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2305  0U);
2306  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2307  flag.execute_tasks(thread, gtid, FALSE,
2308  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2309  __kmp_task_stealing_constraint);
2310  }
2311  }
2312  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2313 
2314 #if OMPT_SUPPORT && OMPT_OPTIONAL
2315  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2316  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2317  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2318  &(my_task_data), codeptr);
2319  }
2320 #endif
2321 
2322 #if USE_ITT_BUILD
2323  if (itt_sync_obj != NULL)
2324  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2325 #endif /* USE_ITT_BUILD */
2326  }
2327  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2328 
2329 #if OMP_50_ENABLED
2330  if (taskgroup->reduce_data != NULL) // need to reduce?
2331  __kmp_task_reduction_fini(thread, taskgroup);
2332 #endif
2333  // Restore parent taskgroup for the current task
2334  taskdata->td_taskgroup = taskgroup->parent;
2335  __kmp_thread_free(thread, taskgroup);
2336 
2337  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2338  gtid, taskdata));
2339  ANNOTATE_HAPPENS_AFTER(taskdata);
2340 
2341 #if OMPT_SUPPORT && OMPT_OPTIONAL
2342  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2343  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2344  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2345  &(my_task_data), codeptr);
2346  }
2347 #endif
2348 }
2349 #endif
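
// Illustrative sketch (not part of the runtime): a `taskgroup` region in user
// code is bracketed by the two entry points above, roughly
//
//   __kmpc_taskgroup(&loc, gtid);
//   ... spawn child tasks with __kmpc_omp_task() ...
//   __kmpc_end_taskgroup(&loc, gtid); // waits until the group's count reaches 0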
2350 
2351 // __kmp_remove_my_task: remove a task from my own deque
2352 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2353  kmp_task_team_t *task_team,
2354  kmp_int32 is_constrained) {
2355  kmp_task_t *task;
2356  kmp_taskdata_t *taskdata;
2357  kmp_thread_data_t *thread_data;
2358  kmp_uint32 tail;
2359 
2360  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2361  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2362  NULL); // Caller should check this condition
2363 
2364  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2365 
2366  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2367  gtid, thread_data->td.td_deque_ntasks,
2368  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2369 
2370  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2371  KA_TRACE(10,
2372  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2373  "ntasks=%d head=%u tail=%u\n",
2374  gtid, thread_data->td.td_deque_ntasks,
2375  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2376  return NULL;
2377  }
2378 
2379  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2380 
2381  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2382  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2383  KA_TRACE(10,
2384  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2385  "ntasks=%d head=%u tail=%u\n",
2386  gtid, thread_data->td.td_deque_ntasks,
2387  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2388  return NULL;
2389  }
2390 
2391  tail = (thread_data->td.td_deque_tail - 1) &
2392  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2393  taskdata = thread_data->td.td_deque[tail];
2394 
2395  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2396  thread->th.th_current_task)) {
2397  // The TSC does not allow taking this task
2398  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2399  KA_TRACE(10,
2400  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2401  "ntasks=%d head=%u tail=%u\n",
2402  gtid, thread_data->td.td_deque_ntasks,
2403  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2404  return NULL;
2405  }
2406 
2407  thread_data->td.td_deque_tail = tail;
2408  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2409 
2410  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2411 
2412  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2413  "ntasks=%d head=%u tail=%u\n",
2414  gtid, taskdata, thread_data->td.td_deque_ntasks,
2415  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2416 
2417  task = KMP_TASKDATA_TO_TASK(taskdata);
2418  return task;
2419 }
2420 
2421 // __kmp_steal_task: remove a task from another thread's deque
2422 // Assume that calling thread has already checked existence of
2423 // task_team thread_data before calling this routine.
2424 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2425  kmp_task_team_t *task_team,
2426  std::atomic<kmp_int32> *unfinished_threads,
2427  int *thread_finished,
2428  kmp_int32 is_constrained) {
2429  kmp_task_t *task;
2430  kmp_taskdata_t *taskdata;
2431  kmp_taskdata_t *current;
2432  kmp_thread_data_t *victim_td, *threads_data;
2433  kmp_int32 target;
2434  kmp_int32 victim_tid;
2435 
2436  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2437 
2438  threads_data = task_team->tt.tt_threads_data;
2439  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2440 
2441  victim_tid = victim_thr->th.th_info.ds.ds_tid;
2442  victim_td = &threads_data[victim_tid];
2443 
2444  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2445  "task_team=%p ntasks=%d head=%u tail=%u\n",
2446  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2447  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2448  victim_td->td.td_deque_tail));
2449 
2450  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2451  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2452  "task_team=%p ntasks=%d head=%u tail=%u\n",
2453  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2454  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2455  victim_td->td.td_deque_tail));
2456  return NULL;
2457  }
2458 
2459  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2460 
2461  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2462  // Check again after we acquire the lock
2463  if (ntasks == 0) {
2464  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2465  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2466  "task_team=%p ntasks=%d head=%u tail=%u\n",
2467  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2468  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2469  return NULL;
2470  }
2471 
2472  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2473  current = __kmp_threads[gtid]->th.th_current_task;
2474  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2475  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2476  // Bump head pointer and Wrap.
2477  victim_td->td.td_deque_head =
2478  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2479  } else {
2480  if (!task_team->tt.tt_untied_task_encountered) {
2481  // The TSC does not allow stealing the victim's task
2482  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2483  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2484  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2485  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2486  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2487  return NULL;
2488  }
2489  int i;
2490  // walk through victim's deque trying to steal any task
2491  target = victim_td->td.td_deque_head;
2492  taskdata = NULL;
2493  for (i = 1; i < ntasks; ++i) {
2494  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2495  taskdata = victim_td->td.td_deque[target];
2496  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2497  break; // found victim task
2498  } else {
2499  taskdata = NULL;
2500  }
2501  }
2502  if (taskdata == NULL) {
2503  // No appropriate candidate to steal found
2504  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2505  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2506  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2507  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2508  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2509  return NULL;
2510  }
2511  int prev = target;
2512  for (i = i + 1; i < ntasks; ++i) {
2513  // shift remaining tasks in the deque left by 1
2514  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2515  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2516  prev = target;
2517  }
2518  KMP_DEBUG_ASSERT(
2519  victim_td->td.td_deque_tail ==
2520  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2521  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2522  }
2523  if (*thread_finished) {
2524  // We need to un-mark this victim as a finished victim. This must be done
2525  // before releasing the lock, or else other threads (starting with the
2526  // master victim) might be prematurely released from the barrier!!!
2527  kmp_int32 count;
2528 
2529  count = KMP_ATOMIC_INC(unfinished_threads);
2530 
2531  KA_TRACE(
2532  20,
2533  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2534  gtid, count + 1, task_team));
2535 
2536  *thread_finished = FALSE;
2537  }
2538  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2539 
2540  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2541 
2542  KMP_COUNT_BLOCK(TASK_stolen);
2543  KA_TRACE(10,
2544  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2545  "task_team=%p ntasks=%d head=%u tail=%u\n",
2546  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2547  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2548 
2549  task = KMP_TASKDATA_TO_TASK(taskdata);
2550  return task;
2551 }
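
// Illustrative note (assuming TASK_DEQUE_MASK(td) is td_deque_size - 1 for a
// power-of-two deque size): each per-thread deque is a circular buffer, so the
// owning thread pops from the tail in __kmp_remove_my_task with
//
//   tail = (tail - 1) & TASK_DEQUE_MASK(td);
//
// while thieves advance the head in __kmp_steal_task with
//
//   head = (head + 1) & TASK_DEQUE_MASK(td);
//
// keeping the owner and the thieves on opposite ends of the deque.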
2552 
2553 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2554 // condition is satisfied (return true) or there are none left (return false).
2555 //
2556 // final_spin is TRUE if this is the spin at the release barrier.
2557 // thread_finished indicates whether the thread is finished executing all
2558 // the tasks it has on its deque, and is at the release barrier.
2559 // spinner is the location on which to spin.
2560 // spinner == NULL means only execute a single task and return.
2561 // checker is the value to check to terminate the spin.
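//
// A rough outline of the control flow below (illustration only):
//
//   while (1) {                       // outer: retry while tasks may still appear
//     while (1) {                     // inner: find and run one task at a time
//       task = own deque nonempty ? pop own tail
//                                  : steal from last-successful or random victim;
//       if (task == NULL) break;
//       __kmp_invoke_task(gtid, task, current_task);
//       if (!final_spin && flag && flag->done_check()) return TRUE;
//     }
//     if (final_spin) { decrement unfinished_threads once; recheck flag; }
//     if (no task team or no chance of more work) return FALSE;
//   }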
2562 template <class C>
2563 static inline int __kmp_execute_tasks_template(
2564  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2565  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2566  kmp_int32 is_constrained) {
2567  kmp_task_team_t *task_team = thread->th.th_task_team;
2568  kmp_thread_data_t *threads_data;
2569  kmp_task_t *task;
2570  kmp_info_t *other_thread;
2571  kmp_taskdata_t *current_task = thread->th.th_current_task;
2572  std::atomic<kmp_int32> *unfinished_threads;
2573  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2574  tid = thread->th.th_info.ds.ds_tid;
2575 
2576  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2577  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2578 
2579  if (task_team == NULL || current_task == NULL)
2580  return FALSE;
2581 
2582  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2583  "*thread_finished=%d\n",
2584  gtid, final_spin, *thread_finished));
2585 
2586  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2587  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2588  KMP_DEBUG_ASSERT(threads_data != NULL);
2589 
2590  nthreads = task_team->tt.tt_nproc;
2591  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2592 #if OMP_45_ENABLED
2593  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2594 #else
2595  KMP_DEBUG_ASSERT(nthreads > 1);
2596 #endif
2597  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2598 
2599  while (1) { // Outer loop keeps trying to find tasks in case of single thread
2600  // getting tasks from target constructs
2601  while (1) { // Inner loop to find a task and execute it
2602  task = NULL;
2603  if (use_own_tasks) { // check on own queue first
2604  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2605  }
2606  if ((task == NULL) && (nthreads > 1)) { // Steal a task
2607  int asleep = 1;
2608  use_own_tasks = 0;
2609  // Try to steal from the last place I stole from successfully.
2610  if (victim_tid == -2) { // haven't stolen anything yet
2611  victim_tid = threads_data[tid].td.td_deque_last_stolen;
2612  if (victim_tid !=
2613  -1) // if we have a last stolen from victim, get the thread
2614  other_thread = threads_data[victim_tid].td.td_thr;
2615  }
2616  if (victim_tid != -1) { // found last victim
2617  asleep = 0;
2618  } else if (!new_victim) { // no recent steals and we haven't already
2619  // used a new victim; select a random thread
2620  do { // Find a different thread to steal work from.
2621  // Pick a random thread. Initial plan was to cycle through all the
2622  // threads, and only return if we tried to steal from every thread,
2623  // and failed. Arch says that's not such a great idea.
2624  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2625  if (victim_tid >= tid) {
2626  ++victim_tid; // Adjusts random distribution to exclude self
2627  }
2628  // Found a potential victim
2629  other_thread = threads_data[victim_tid].td.td_thr;
2630  // There is a slight chance that __kmp_enable_tasking() did not wake
2631  // up all threads waiting at the barrier. If victim is sleeping,
2632  // then wake it up. Since we were going to pay the cache miss
2633  // penalty for referencing another thread's kmp_info_t struct
2634  // anyway,
2635  // the check shouldn't cost too much performance at this point. In
2636  // extra barrier mode, tasks do not sleep at the separate tasking
2637  // barrier, so this isn't a problem.
2638  asleep = 0;
2639  if ((__kmp_tasking_mode == tskm_task_teams) &&
2640  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2641  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2642  NULL)) {
2643  asleep = 1;
2644  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2645  other_thread->th.th_sleep_loc);
2646  // A sleeping thread should not have any tasks on its queue.
2647  // There is a slight possibility that it resumes, steals a task
2648  // from another thread, which spawns more tasks, all in the time
2649  // that it takes this thread to check => don't write an assertion
2650  // that the victim's queue is empty. Try stealing from a
2651  // different thread.
2652  }
2653  } while (asleep);
2654  }
2655 
2656  if (!asleep) {
2657  // We have a victim to try to steal from
2658  task = __kmp_steal_task(other_thread, gtid, task_team,
2659  unfinished_threads, thread_finished,
2660  is_constrained);
2661  }
2662  if (task != NULL) { // set last stolen to victim
2663  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2664  threads_data[tid].td.td_deque_last_stolen = victim_tid;
2665  // The pre-refactored code did not try more than 1 successful new
2666  // victim, unless the last one generated more local tasks;
2667  // new_victim keeps track of this
2668  new_victim = 1;
2669  }
2670  } else { // No tasks found; unset last_stolen
2671  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2672  victim_tid = -2; // no successful victim found
2673  }
2674  }
2675 
2676  if (task == NULL) // break out of tasking loop
2677  break;
2678 
2679 // Found a task; execute it
2680 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2681  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2682  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2683  // get the object reliably
2684  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2685  }
2686  __kmp_itt_task_starting(itt_sync_obj);
2687  }
2688 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2689  __kmp_invoke_task(gtid, task, current_task);
2690 #if USE_ITT_BUILD
2691  if (itt_sync_obj != NULL)
2692  __kmp_itt_task_finished(itt_sync_obj);
2693 #endif /* USE_ITT_BUILD */
2694  // If this thread is only partway through the barrier and the condition is
2695  // met, then return now, so that the barrier gather/release pattern can
2696  // proceed. If this thread is in the last spin loop in the barrier,
2697  // waiting to be released, we know that the termination condition will not
2698  // be satisfied, so don't waste any cycles checking it.
2699  if (flag == NULL || (!final_spin && flag->done_check())) {
2700  KA_TRACE(
2701  15,
2702  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2703  gtid));
2704  return TRUE;
2705  }
2706  if (thread->th.th_task_team == NULL) {
2707  break;
2708  }
2709  // Yield before executing next task
2710  KMP_YIELD(__kmp_library == library_throughput);
2711  // If execution of a stolen task results in more tasks being placed on our
2712  // run queue, reset use_own_tasks
2713  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2714  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2715  "other tasks, restart\n",
2716  gtid));
2717  use_own_tasks = 1;
2718  new_victim = 0;
2719  }
2720  }
2721 
2722 // The task source has been exhausted. If in final spin loop of barrier, check
2723 // if termination condition is satisfied.
2724 #if OMP_45_ENABLED
2725  // The work queue may be empty but there might be proxy tasks still
2726  // executing
2727  if (final_spin &&
2728  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
2729 #else
2730  if (final_spin)
2731 #endif
2732  {
2733  // First, decrement the #unfinished threads, if that has not already been
2734  // done. This decrement might be to the spin location, and result in the
2735  // termination condition being satisfied.
2736  if (!*thread_finished) {
2737  kmp_int32 count;
2738 
2739  count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
2740  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2741  "unfinished_threads to %d task_team=%p\n",
2742  gtid, count, task_team));
2743  *thread_finished = TRUE;
2744  }
2745 
2746  // It is now unsafe to reference thread->th.th_team !!!
2747  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2748  // thread to pass through the barrier, where it might reset each thread's
2749  // th.th_team field for the next parallel region. If we can steal more
2750  // work, we know that this has not happened yet.
2751  if (flag != NULL && flag->done_check()) {
2752  KA_TRACE(
2753  15,
2754  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2755  gtid));
2756  return TRUE;
2757  }
2758  }
2759 
2760  // If this thread's task team is NULL, master has recognized that there are
2761  // no more tasks; bail out
2762  if (thread->th.th_task_team == NULL) {
2763  KA_TRACE(15,
2764  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2765  return FALSE;
2766  }
2767 
2768 #if OMP_45_ENABLED
2769  // We could be getting tasks from target constructs; if this is the only
2770  // thread, keep trying to execute tasks from own queue
2771  if (nthreads == 1)
2772  use_own_tasks = 1;
2773  else
2774 #endif
2775  {
2776  KA_TRACE(15,
2777  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2778  return FALSE;
2779  }
2780  }
2781 }
2782 
2783 int __kmp_execute_tasks_32(
2784  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2785  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2786  kmp_int32 is_constrained) {
2787  return __kmp_execute_tasks_template(
2788  thread, gtid, flag, final_spin,
2789  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2790 }
2791 
2792 int __kmp_execute_tasks_64(
2793  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2794  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2795  kmp_int32 is_constrained) {
2796  return __kmp_execute_tasks_template(
2797  thread, gtid, flag, final_spin,
2798  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2799 }
2800 
2801 int __kmp_execute_tasks_oncore(
2802  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2803  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2804  kmp_int32 is_constrained) {
2805  return __kmp_execute_tasks_template(
2806  thread, gtid, flag, final_spin,
2807  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2808 }
2809 
2810 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2811 // next barrier so they can assist in executing enqueued tasks.
2812 // First thread in allocates the task team atomically.
2813 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
2814  kmp_info_t *this_thr) {
2815  kmp_thread_data_t *threads_data;
2816  int nthreads, i, is_init_thread;
2817 
2818  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
2819  __kmp_gtid_from_thread(this_thr)));
2820 
2821  KMP_DEBUG_ASSERT(task_team != NULL);
2822  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2823 
2824  nthreads = task_team->tt.tt_nproc;
2825  KMP_DEBUG_ASSERT(nthreads > 0);
2826  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2827 
2828  // Allocate or increase the size of threads_data if necessary
2829  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
2830 
2831  if (!is_init_thread) {
2832  // Some other thread already set up the array.
2833  KA_TRACE(
2834  20,
2835  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2836  __kmp_gtid_from_thread(this_thr)));
2837  return;
2838  }
2839  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2840  KMP_DEBUG_ASSERT(threads_data != NULL);
2841 
2842  if ((__kmp_tasking_mode == tskm_task_teams) &&
2843  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
2844  // Release any threads sleeping at the barrier, so that they can steal
2845  // tasks and execute them. In extra barrier mode, tasks do not sleep
2846  // at the separate tasking barrier, so this isn't a problem.
2847  for (i = 0; i < nthreads; i++) {
2848  volatile void *sleep_loc;
2849  kmp_info_t *thread = threads_data[i].td.td_thr;
2850 
2851  if (i == this_thr->th.th_info.ds.ds_tid) {
2852  continue;
2853  }
2854  // Since we haven't locked the thread's suspend mutex lock at this
2855  // point, there is a small window where a thread might be putting
2856  // itself to sleep, but hasn't set the th_sleep_loc field yet.
2857  // To work around this, __kmp_execute_tasks_template() periodically checks
2858  // to see if other threads are sleeping (using the same random mechanism that
2859  // is used for task stealing) and awakens them if they are.
2860  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2861  NULL) {
2862  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2863  __kmp_gtid_from_thread(this_thr),
2864  __kmp_gtid_from_thread(thread)));
2865  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2866  } else {
2867  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2868  __kmp_gtid_from_thread(this_thr),
2869  __kmp_gtid_from_thread(thread)));
2870  }
2871  }
2872  }
2873 
2874  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
2875  __kmp_gtid_from_thread(this_thr)));
2876 }
2877 
2878 /* // TODO: Check the comment consistency
2879  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
2880  * like a shadow of the kmp_team_t data struct, with a different lifetime.
2881  * After a child thread checks into a barrier and calls __kmp_release() from
2882  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2883  * longer assume that the kmp_team_t structure is intact (at any moment, the
2884  * master thread may exit the barrier code and free the team data structure,
2885  * and return the threads to the thread pool).
2886  *
2887  * This does not work with the tasking code, as the thread is still
2888  * expected to participate in the execution of any tasks that may have been
2889  * spawned by a member of the team, and the thread still needs access to
2890  * each of the other threads in the team, so that it can steal work from them.
2891  *
2892  * Enter the existence of the kmp_task_team_t struct. It employs a reference
2893  * counting mechanism, and is allocated by the master thread before calling
2894  * __kmp_<barrier_kind>_release, and then is released by the last thread to
2895  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
2896  * of the kmp_task_team_t structs for consecutive barriers can overlap
2897  * (and will, unless the master thread is the last thread to exit the barrier
2898  * release phase, which is not typical).
2899  *
2900  * The existence of such a struct is useful outside the context of tasking,
2901  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2902  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2903  * libraries.
2904  *
2905  * We currently use the existence of the threads array as an indicator that
2906  * tasks were spawned since the last barrier. If the structure is to be
2907  * useful outside the context of tasking, then this will have to change, but
2908  * not settting the field minimizes the performance impact of tasking on
2909  * barriers, when no explicit tasks were spawned (pushed, actually).
2910  */
2911 
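// Editor's note: the sketch below illustrates the reference-counting idea
// described in the comment above, using standalone C++ and made-up names
// (shadow_team, leave_barrier); it is not part of this library. The master
// allocates the shadow struct, every thread decrements an unfinished-threads
// counter on the way out of the barrier, and the last one to reach zero
// recycles it, so the struct can safely outlive the team it shadows.
#if 0
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct shadow_team {                // plays the role of kmp_task_team_t
  std::atomic<int> unfinished;      // like tt_unfinished_threads
  explicit shadow_team(int nthreads) : unfinished(nthreads) {}
};

static void leave_barrier(shadow_team *tt, int tid) {
  // Each thread checks out exactly once; the last one owns the cleanup.
  if (tt->unfinished.fetch_sub(1) == 1) {
    std::printf("thread %d is last out, recycling shadow team\n", tid);
    delete tt;
  }
}

int main() {
  const int nthreads = 4;
  shadow_team *tt = new shadow_team(nthreads);
  std::vector<std::thread> pool;
  for (int tid = 0; tid < nthreads; ++tid)
    pool.emplace_back(leave_barrier, tt, tid);
  for (auto &t : pool)
    t.join();
  return 0;
}
#endif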
2912 static kmp_task_team_t *__kmp_free_task_teams =
2913  NULL; // Free list for task_team data structures
2914 // Lock for task team data structures
2915 kmp_bootstrap_lock_t __kmp_task_team_lock =
2916  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
2917 
2918 // __kmp_alloc_task_deque:
2919 // Allocates a task deque for a particular thread, and initializes the necessary
2920 // data structures relating to the deque. This only happens once per thread
2921 // per task team since task teams are recycled. No lock is needed during
2922 // allocation since each thread allocates its own deque.
2923 static void __kmp_alloc_task_deque(kmp_info_t *thread,
2924  kmp_thread_data_t *thread_data) {
2925  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
2926  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
2927 
2928  // Initialize last stolen task field to "none"
2929  thread_data->td.td_deque_last_stolen = -1;
2930 
2931  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
2932  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
2933  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
2934 
2935  KE_TRACE(
2936  10,
2937  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2938  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
2939  // Allocate space for task deque, and zero the deque
2940  // Cannot use __kmp_thread_calloc() because threads not around for
2941  // kmp_reap_task_team( ).
2942  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
2943  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2944  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2945 }
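// Editor's note: the macros INITIAL_TASK_DEQUE_SIZE / TASK_DEQUE_MASK used
// around here imply a power-of-two ring buffer. The standalone sketch below
// (made-up names, not the library's types) shows the head/tail/mask index
// arithmetic the per-thread deques rely on.
#if 0
#include <cassert>
#include <cstdint>
#include <vector>

struct ring_deque {
  std::vector<void *> slots;
  uint32_t head = 0, tail = 0, ntasks = 0;
  explicit ring_deque(uint32_t size) : slots(size) {
    assert((size & (size - 1)) == 0 && "size must be a power of two");
  }
  uint32_t mask() const { return (uint32_t)slots.size() - 1; }
  bool push_tail(void *task) {        // owner pushes at the tail
    if (ntasks >= slots.size())
      return false;                   // full: caller must grow or retry
    slots[tail] = task;
    tail = (tail + 1) & mask();       // wrap with a mask instead of a modulo
    ++ntasks;
    return true;
  }
  void *pop_head() {                  // a thief steals from the head
    if (ntasks == 0)
      return nullptr;
    void *task = slots[head];
    head = (head + 1) & mask();
    --ntasks;
    return task;
  }
};
#endif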
2946 
2947 // __kmp_free_task_deque:
2948 // Deallocates a task deque for a particular thread. Happens at library
2949 // deallocation, so there is no need to reset all thread data fields.
2950 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
2951  if (thread_data->td.td_deque != NULL) {
2952  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2953  TCW_4(thread_data->td.td_deque_ntasks, 0);
2954  __kmp_free(thread_data->td.td_deque);
2955  thread_data->td.td_deque = NULL;
2956  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2957  }
2958 
2959 #ifdef BUILD_TIED_TASK_STACK
2960  // GEH: Figure out what to do here for td_susp_tied_tasks
2961  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
2962  __kmp_free_task_stack(__kmp_get_gtid(), thread_data); // no gtid parameter here
2963  }
2964 #endif // BUILD_TIED_TASK_STACK
2965 }
2966 
2967 // __kmp_realloc_task_threads_data:
2968 // Allocates a threads_data array for a task team, either by allocating an
2969 // initial array or enlarging an existing array. Only the first thread to get
2970 // the lock allocates or enlarges the array and re-initializes the array elements.
2971 // That thread returns "TRUE", the rest return "FALSE".
2972 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2973 // The current size is given by task_team -> tt.tt_max_threads.
2974 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
2975  kmp_task_team_t *task_team) {
2976  kmp_thread_data_t **threads_data_p;
2977  kmp_int32 nthreads, maxthreads;
2978  int is_init_thread = FALSE;
2979 
2980  if (TCR_4(task_team->tt.tt_found_tasks)) {
2981  // Already reallocated and initialized.
2982  return FALSE;
2983  }
2984 
2985  threads_data_p = &task_team->tt.tt_threads_data;
2986  nthreads = task_team->tt.tt_nproc;
2987  maxthreads = task_team->tt.tt_max_threads;
2988 
2989  // All threads must lock when they encounter the first task of the implicit
2990  // task region to make sure threads_data fields are (re)initialized before
2991  // used.
2992  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2993 
2994  if (!TCR_4(task_team->tt.tt_found_tasks)) {
2995  // first thread to enable tasking
2996  kmp_team_t *team = thread->th.th_team;
2997  int i;
2998 
2999  is_init_thread = TRUE;
3000  if (maxthreads < nthreads) {
3001 
3002  if (*threads_data_p != NULL) {
3003  kmp_thread_data_t *old_data = *threads_data_p;
3004  kmp_thread_data_t *new_data = NULL;
3005 
3006  KE_TRACE(
3007  10,
3008  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3009  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3010  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3011  // Reallocate threads_data to have more elements than current array
3012  // Cannot use __kmp_thread_realloc() because threads not around for
3013  // kmp_reap_task_team( ). Note all new array entries are initialized
3014  // to zero by __kmp_allocate().
3015  new_data = (kmp_thread_data_t *)__kmp_allocate(
3016  nthreads * sizeof(kmp_thread_data_t));
3017  // copy old data to new data
3018  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3019  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3020 
3021 #ifdef BUILD_TIED_TASK_STACK
3022  // GEH: Figure out if this is the right thing to do
3023  for (i = maxthreads; i < nthreads; i++) {
3024  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3025  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3026  }
3027 #endif // BUILD_TIED_TASK_STACK
3028  // Install the new data and free the old data
3029  (*threads_data_p) = new_data;
3030  __kmp_free(old_data);
3031  } else {
3032  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3033  "threads data for task_team %p, size = %d\n",
3034  __kmp_gtid_from_thread(thread), task_team, nthreads));
3035  // Make the initial allocate for threads_data array, and zero entries
3036  // Cannot use __kmp_thread_calloc() because threads not around for
3037  // kmp_reap_task_team( ).
3038  ANNOTATE_IGNORE_WRITES_BEGIN();
3039  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3040  nthreads * sizeof(kmp_thread_data_t));
3041  ANNOTATE_IGNORE_WRITES_END();
3042 #ifdef BUILD_TIED_TASK_STACK
3043  // GEH: Figure out if this is the right thing to do
3044  for (i = 0; i < nthreads; i++) {
3045  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3046  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3047  }
3048 #endif // BUILD_TIED_TASK_STACK
3049  }
3050  task_team->tt.tt_max_threads = nthreads;
3051  } else {
3052  // If array has (more than) enough elements, go ahead and use it
3053  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3054  }
3055 
3056  // initialize threads_data pointers back to thread_info structures
3057  for (i = 0; i < nthreads; i++) {
3058  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3059  thread_data->td.td_thr = team->t.t_threads[i];
3060 
3061  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3062  // The last stolen field survives across teams/barriers, and the number
3063  // of threads may have changed. It is possible (likely?) that a new
3064  // parallel region will exhibit the same behavior as the previous region.
3065  thread_data->td.td_deque_last_stolen = -1;
3066  }
3067  }
3068 
3069  KMP_MB();
3070  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3071  }
3072 
3073  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3074  return is_init_thread;
3075 }
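// Editor's note: a minimal standalone sketch of the check / lock / re-check
// pattern used by __kmp_realloc_task_threads_data above; only the first
// thread through performs the (re)allocation, later threads see the flag
// already set. The names (grow_once, ensure) are illustrative only.
#if 0
#include <atomic>
#include <cstring>
#include <mutex>

struct grow_once {
  std::mutex lock;
  std::atomic<bool> initialized{false}; // like tt_found_tasks
  int *data = nullptr;
  int capacity = 0;

  // Returns true only for the thread that actually did the work.
  bool ensure(int needed) {
    if (initialized.load(std::memory_order_acquire))
      return false;                       // cheap early-out, no lock taken
    std::lock_guard<std::mutex> g(lock);
    if (initialized.load(std::memory_order_relaxed))
      return false;                       // somebody beat us to it
    int *bigger = new int[needed]();      // new entries zero-initialized
    if (data != nullptr) {
      std::memcpy(bigger, data, capacity * sizeof(int));
      delete[] data;
    }
    data = bigger;
    capacity = needed;
    initialized.store(true, std::memory_order_release);
    return true;
  }
};
#endif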
3076 
3077 // __kmp_free_task_threads_data:
3078 // Deallocates a threads_data array for a task team, including any attached
3079 // tasking deques. Only occurs at library shutdown.
3080 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3081  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3082  if (task_team->tt.tt_threads_data != NULL) {
3083  int i;
3084  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3085  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3086  }
3087  __kmp_free(task_team->tt.tt_threads_data);
3088  task_team->tt.tt_threads_data = NULL;
3089  }
3090  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3091 }
3092 
3093 // __kmp_allocate_task_team:
3094 // Allocates a task team associated with a specific team, taking it from
3095 // the global task team free list if possible. Also initializes data
3096 // structures.
3097 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3098  kmp_team_t *team) {
3099  kmp_task_team_t *task_team = NULL;
3100  int nthreads;
3101 
3102  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3103  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3104 
3105  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3106  // Take a task team from the task team pool
3107  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3108  if (__kmp_free_task_teams != NULL) {
3109  task_team = __kmp_free_task_teams;
3110  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3111  task_team->tt.tt_next = NULL;
3112  }
3113  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3114  }
3115 
3116  if (task_team == NULL) {
3117  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3118  "task team for team %p\n",
3119  __kmp_gtid_from_thread(thread), team));
3120  // Allocate a new task team if one is not available.
3121  // Cannot use __kmp_thread_malloc() because threads not around for
3122  // kmp_reap_task_team( ).
3123  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3124  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3125  // AC: __kmp_allocate zeroes returned memory
3126  // task_team -> tt.tt_threads_data = NULL;
3127  // task_team -> tt.tt_max_threads = 0;
3128  // task_team -> tt.tt_next = NULL;
3129  }
3130 
3131  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3132 #if OMP_45_ENABLED
3133  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3134 #endif
3135  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3136 
3137  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3138  TCW_4(task_team->tt.tt_active, TRUE);
3139 
3140  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3141  "unfinished_threads init'd to %d\n",
3142  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3143  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3144  return task_team;
3145 }
3146 
3147 // __kmp_free_task_team:
3148 // Frees the task team associated with a specific thread, and adds it
3149 // to the global task team free list.
3150 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3151  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3152  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3153 
3154  // Put task team back on free list
3155  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3156 
3157  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3158  task_team->tt.tt_next = __kmp_free_task_teams;
3159  TCW_PTR(__kmp_free_task_teams, task_team);
3160 
3161  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3162 }
3163 
3164 // __kmp_reap_task_teams:
3165 // Free all the task teams on the task team free list.
3166 // Should only be done during library shutdown.
3167 // Cannot do anything that needs a thread structure or gtid since they are
3168 // already gone.
3169 void __kmp_reap_task_teams(void) {
3170  kmp_task_team_t *task_team;
3171 
3172  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3173  // Free all task_teams on the free list
3174  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3175  while ((task_team = __kmp_free_task_teams) != NULL) {
3176  __kmp_free_task_teams = task_team->tt.tt_next;
3177  task_team->tt.tt_next = NULL;
3178 
3179  // Free threads_data if necessary
3180  if (task_team->tt.tt_threads_data != NULL) {
3181  __kmp_free_task_threads_data(task_team);
3182  }
3183  __kmp_free(task_team);
3184  }
3185  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3186  }
3187 }
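// Editor's note: the three routines above implement a classic lock-protected
// intrusive free list. The standalone sketch below (made-up toy_task_team
// type) shows the same allocate-from-list / push-back / drain-at-shutdown
// pattern; it is an illustration, not the library's code.
#if 0
#include <mutex>

struct toy_task_team {
  toy_task_team *next = nullptr;
};

static toy_task_team *free_list = nullptr;
static std::mutex free_list_lock;

static toy_task_team *allocate_team() {
  {
    std::lock_guard<std::mutex> g(free_list_lock);
    if (free_list != nullptr) {
      toy_task_team *tt = free_list;    // recycle from the head
      free_list = tt->next;
      tt->next = nullptr;
      return tt;
    }
  }
  return new toy_task_team();           // none available: allocate fresh
}

static void free_team(toy_task_team *tt) {
  std::lock_guard<std::mutex> g(free_list_lock);
  tt->next = free_list;                 // push back onto the head for reuse
  free_list = tt;
}

static void reap_teams() {              // library shutdown only
  std::lock_guard<std::mutex> g(free_list_lock);
  while (free_list != nullptr) {
    toy_task_team *tt = free_list;
    free_list = tt->next;
    delete tt;
  }
}

int main() {
  toy_task_team *a = allocate_team();
  free_team(a);
  toy_task_team *b = allocate_team();   // gets the recycled struct back
  free_team(b);
  reap_teams();
  return 0;
}
#endif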
3188 
3189 // __kmp_wait_to_unref_task_teams:
3190 // Some threads could still be in the fork barrier release code, possibly
3191 // trying to steal tasks. Wait for each thread to unreference its task team.
3192 void __kmp_wait_to_unref_task_teams(void) {
3193  kmp_info_t *thread;
3194  kmp_uint32 spins;
3195  int done;
3196 
3197  KMP_INIT_YIELD(spins);
3198 
3199  for (;;) {
3200  done = TRUE;
3201 
3202  // TODO: GEH - this may be wrong because some sync would be necessary
3203  // in case threads are added to the pool during the traversal. Need to
3204  // verify that lock for thread pool is held when calling this routine.
3205  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3206  thread = thread->th.th_next_pool) {
3207 #if KMP_OS_WINDOWS
3208  DWORD exit_val;
3209 #endif
3210  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3211  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3212  __kmp_gtid_from_thread(thread)));
3213  continue;
3214  }
3215 #if KMP_OS_WINDOWS
3216  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3217  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3218  thread->th.th_task_team = NULL;
3219  continue;
3220  }
3221 #endif
3222 
3223  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3224 
3225  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3226  "unreference task_team\n",
3227  __kmp_gtid_from_thread(thread)));
3228 
3229  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3230  volatile void *sleep_loc;
3231  // If the thread is sleeping, awaken it.
3232  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3233  NULL) {
3234  KA_TRACE(
3235  10,
3236  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3237  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3238  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3239  }
3240  }
3241  }
3242  if (done) {
3243  break;
3244  }
3245 
3246  // If we are oversubscribed, or have waited a bit (and library mode is
3247  // throughput), yield. Pause is in the following code.
3248  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
3249  KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
3250  }
3251 }
3252 
3253 // __kmp_task_team_setup: Create a task_team for the current team, but use
3254 // an already created, unused one if it already exists.
3255 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3256  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3257 
3258  // If this task_team hasn't been created yet, allocate it. It will be used in
3259  // the region after the next.
3260  // If it exists, it is the current task team and shouldn't be touched yet as
3261  // it may still be in use.
3262  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3263  (always || team->t.t_nproc > 1)) {
3264  team->t.t_task_team[this_thr->th.th_task_state] =
3265  __kmp_allocate_task_team(this_thr, team);
3266  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3267  "for team %d at parity=%d\n",
3268  __kmp_gtid_from_thread(this_thr),
3269  team->t.t_task_team[this_thr->th.th_task_state],
3270  ((team != NULL) ? team->t.t_id : -1),
3271  this_thr->th.th_task_state));
3272  }
3273 
3274  // After threads exit the release, they will call sync, and then point to this
3275  // other task_team; make sure it is allocated and properly initialized. As
3276  // threads spin in the barrier release phase, they will continue to use the
3277  // previous task_team struct(above), until they receive the signal to stop
3278  // checking for tasks (they can't safely reference the kmp_team_t struct,
3279  // which could be reallocated by the master thread). No task teams are formed
3280  // for serialized teams.
3281  if (team->t.t_nproc > 1) {
3282  int other_team = 1 - this_thr->th.th_task_state;
3283  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3284  team->t.t_task_team[other_team] =
3285  __kmp_allocate_task_team(this_thr, team);
3286  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3287  "task_team %p for team %d at parity=%d\n",
3288  __kmp_gtid_from_thread(this_thr),
3289  team->t.t_task_team[other_team],
3290  ((team != NULL) ? team->t.t_id : -1), other_team));
3291  } else { // Leave the old task team struct in place for the upcoming region;
3292  // adjust as needed
3293  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3294  if (!task_team->tt.tt_active ||
3295  team->t.t_nproc != task_team->tt.tt_nproc) {
3296  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3297  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3298 #if OMP_45_ENABLED
3299  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3300 #endif
3301  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3302  team->t.t_nproc);
3303  TCW_4(task_team->tt.tt_active, TRUE);
3304  }
3305  // if team size has changed, the first thread to enable tasking will
3306  // realloc threads_data if necessary
3307  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3308  "%p for team %d at parity=%d\n",
3309  __kmp_gtid_from_thread(this_thr),
3310  team->t.t_task_team[other_team],
3311  ((team != NULL) ? team->t.t_id : -1), other_team));
3312  }
3313  }
3314 }
3315 
3316 // __kmp_task_team_sync: Propagation of task team data from team to threads
3317 // which happens just after the release phase of a team barrier. This may be
3318 // called by any thread, but only for teams with # threads > 1.
3319 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3320  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3321 
3322  // Toggle the th_task_state field, to switch which task_team this thread
3323  // refers to
3324  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3325  // It is now safe to propagate the task team pointer from the team struct to
3326  // the current thread.
3327  TCW_PTR(this_thr->th.th_task_team,
3328  team->t.t_task_team[this_thr->th.th_task_state]);
3329  KA_TRACE(20,
3330  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3331  "%p from Team #%d (parity=%d)\n",
3332  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3333  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
3334 }
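// Editor's note: a standalone sketch of the two-slot parity scheme that
// __kmp_task_team_setup and __kmp_task_team_sync implement above. The team
// keeps two task-team slots; each thread tracks a 0/1 parity, the slot for
// the next region is prepared while the current one may still be draining,
// and the parity toggle switches threads over. Illustrative names only.
#if 0
#include <cstdio>

struct toy_team {
  const char *task_team[2] = {nullptr, nullptr};
};

struct toy_thread {
  int task_state = 0; // parity: which slot this thread currently uses
};

static void setup(toy_team &team, toy_thread &master, const char *next_name) {
  int other_team = 1 - master.task_state;
  if (team.task_team[other_team] == nullptr)
    team.task_team[other_team] = next_name; // prepare the next region's slot
}

static void sync_after_barrier(toy_team &team, toy_thread &thr) {
  thr.task_state = 1 - thr.task_state;      // toggle parity ...
  std::printf("now using %s\n", team.task_team[thr.task_state]); // ... switch
}

int main() {
  toy_team team;
  toy_thread master;
  team.task_team[master.task_state] = "task_team A";
  setup(team, master, "task_team B");
  sync_after_barrier(team, master); // prints "now using task_team B"
  return 0;
}
#endif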
3335 
3336 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
3337 // barrier gather phase. Only called by master thread if #threads in team > 1 or
3338 // if proxy tasks were created.
3339 //
3340 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3341 // by passing in 0 optionally as the last argument. When wait is zero, master
3342 // thread does not wait for unfinished_threads to reach 0.
3343 void __kmp_task_team_wait(
3344  kmp_info_t *this_thr,
3345  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3346  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3347 
3348  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3349  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3350 
3351  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3352  if (wait) {
3353  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3354  "(for unfinished_threads to reach 0) on task_team = %p\n",
3355  __kmp_gtid_from_thread(this_thr), task_team));
3356  // Worker threads may have dropped through to release phase, but could
3357  // still be executing tasks. Wait here for tasks to complete. To avoid
3358  // memory contention, only master thread checks termination condition.
3359  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3360  &task_team->tt.tt_unfinished_threads),
3361  0U);
3362  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3363  }
3364  // Deactivate the old task team, so that the worker threads will stop
3365  // referencing it while spinning.
3366  KA_TRACE(
3367  20,
3368  ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3369  "setting active to false, setting local and team's pointer to NULL\n",
3370  __kmp_gtid_from_thread(this_thr), task_team));
3371 #if OMP_45_ENABLED
3372  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3373  task_team->tt.tt_found_proxy_tasks == TRUE);
3374  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3375 #else
3376  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
3377 #endif
3378  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3379  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3380  KMP_MB();
3381 
3382  TCW_PTR(this_thr->th.th_task_team, NULL);
3383  }
3384 }
3385 
3386 // __kmp_tasking_barrier:
3387 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
3388 // Internal function to execute all tasks prior to a regular barrier or a join
3389 // barrier. It is a full barrier itself, which unfortunately turns regular
3390 // barriers into double barriers and join barriers into 1 1/2 barriers.
3391 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3392  std::atomic<kmp_uint32> *spin = RCAST(
3393  std::atomic<kmp_uint32> *,
3394  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3395  int flag = FALSE;
3396  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3397 
3398 #if USE_ITT_BUILD
3399  KMP_FSYNC_SPIN_INIT(spin, NULL);
3400 #endif /* USE_ITT_BUILD */
3401  kmp_flag_32 spin_flag(spin, 0U);
3402  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3403  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3404 #if USE_ITT_BUILD
3405  // TODO: What about itt_sync_obj??
3406  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3407 #endif /* USE_ITT_BUILD */
3408 
3409  if (TCR_4(__kmp_global.g.g_done)) {
3410  if (__kmp_global.g.g_abort)
3411  __kmp_abort_thread();
3412  break;
3413  }
3414  KMP_YIELD(TRUE); // GH: We always yield here
3415  }
3416 #if USE_ITT_BUILD
3417  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3418 #endif /* USE_ITT_BUILD */
3419 }
3420 
3421 #if OMP_45_ENABLED
3422 
3423 // __kmp_give_task puts a task into a given thread queue if:
3424 // - the queue for that thread was created
3425 // - there's space in that queue
3426 // Because of this, __kmp_push_task needs to check if there's space after
3427 // getting the lock
3428 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3429  kmp_int32 pass) {
3430  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3431  kmp_task_team_t *task_team = taskdata->td_task_team;
3432 
3433  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3434  taskdata, tid));
3435 
3436  // If task_team is NULL something went really bad...
3437  KMP_DEBUG_ASSERT(task_team != NULL);
3438 
3439  bool result = false;
3440  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3441 
3442  if (thread_data->td.td_deque == NULL) {
3443  // There's no queue in this thread, go find another one
3444  // We're guaranteed that at least one thread has a queue
3445  KA_TRACE(30,
3446  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3447  tid, taskdata));
3448  return result;
3449  }
3450 
3451  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3452  TASK_DEQUE_SIZE(thread_data->td)) {
3453  KA_TRACE(
3454  30,
3455  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3456  taskdata, tid));
3457 
3458  // if this deque has already grown past the current pass ratio, give
3459  // another thread a chance
3460  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3461  return result;
3462 
3463  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3464  __kmp_realloc_task_deque(thread, thread_data);
3465 
3466  } else {
3467 
3468  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3469 
3470  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3471  TASK_DEQUE_SIZE(thread_data->td)) {
3472  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3473  "thread %d.\n",
3474  taskdata, tid));
3475 
3476  // if this deque has already grown past the current pass ratio, give
3477  // another thread a chance
3478  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3479  goto release_and_exit;
3480 
3481  __kmp_realloc_task_deque(thread, thread_data);
3482  }
3483  }
3484 
3485  // lock is held here, and there is space in the deque
3486 
3487  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3488  // Wrap index.
3489  thread_data->td.td_deque_tail =
3490  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3491  TCW_4(thread_data->td.td_deque_ntasks,
3492  TCR_4(thread_data->td.td_deque_ntasks) + 1);
3493 
3494  result = true;
3495  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3496  taskdata, tid));
3497 
3498 release_and_exit:
3499  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3500 
3501  return result;
3502 }
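// Editor's note: a standalone sketch of how the pass parameter above can be
// used by a caller -- a full queue is only grown once it is still small
// relative to the number of full sweeps already made; otherwise the task is
// offered to the next thread, and the threshold doubles after each sweep.
// The names (toy_queue, give_round_robin) are illustrative, not the library's.
#if 0
#include <cstddef>
#include <vector>

constexpr std::size_t kInitialSize = 8;

struct toy_queue {
  std::vector<int> tasks;
  std::size_t capacity = kInitialSize;
};

static bool give_task(toy_queue &q, int task, std::size_t pass) {
  if (q.tasks.size() >= q.capacity) {
    // Full: grow only if the queue has not already been grown past the
    // current pass ratio; otherwise decline and let another queue take it.
    if (q.capacity / kInitialSize >= pass)
      return false;
    q.capacity *= 2;
  }
  q.tasks.push_back(task);
  return true;
}

static void give_round_robin(std::vector<toy_queue> &queues, int task) {
  std::size_t pass = 1, start = 0, k = start;
  for (;;) {
    toy_queue &q = queues[k];
    k = (k + 1) % queues.size();
    if (give_task(q, task, pass))
      return;
    if (k == start)   // completed a full sweep without placing the task
      pass <<= 1;     // become progressively more willing to grow a queue
  }
}
#endif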
3503 
3504 /* The finish of the proxy tasks is divided in two pieces:
3505  - the top half is the one that can be done from a thread outside the team
3506  - the bottom half must be run from a thread within the team
3507 
3508  In order to run the bottom half the task gets queued back into one of the
3509  threads of the team. Once the td_incomplete_child_tasks counter of the parent
3510  is decremented the threads can leave the barriers. So, the bottom half needs
3511  to be queued before the counter is decremented. The top half is therefore
3512  divided in two parts:
3513  - things that can be run before queuing the bottom half
3514  - things that must be run after queuing the bottom half
3515 
3516  This creates a second race as the bottom half can free the task before the
3517  second top half is executed. To avoid this we use the
3518  td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
3519  half. */
3520 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3521  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3522  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3523  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3524  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3525 
3526  taskdata->td_flags.complete = 1; // mark the task as completed
3527 
3528  if (taskdata->td_taskgroup)
3529  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3530 
3531  // Create an imaginary child for this task so the bottom half cannot
3532  // release the task before we have completed the second top half
3533  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3534 }
3535 
3536 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3537  kmp_int32 children = 0;
3538 
3539  // Predecrement simulated by "- 1" calculation
3540  children =
3541  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3542  KMP_DEBUG_ASSERT(children >= 0);
3543 
3544  // Remove the imaginary child
3545  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3546 }
3547 
3548 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3549  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3550  kmp_info_t *thread = __kmp_threads[gtid];
3551 
3552  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3553  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3554  1); // top half must run before bottom half
3555 
3556  // We need to wait to make sure the top half is finished
3557  // Spinning here should be ok as this should happen quickly
3558  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3559  ;
3560 
3561  __kmp_release_deps(gtid, taskdata);
3562  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3563 }
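// Editor's note: a standalone sketch of the top-half / bottom-half handshake
// described in the comment before these routines. The first top half
// publishes one imaginary child so the bottom half cannot free the task until
// the second top half retires it. The toy_proxy type and single-threaded
// driver are illustrative only; in the library the halves can run on
// different threads.
#if 0
#include <atomic>
#include <cassert>

struct toy_proxy {
  std::atomic<int> incomplete_children{0};
  bool complete = false;
  bool freed = false;
};

static void first_top_half(toy_proxy &t) {
  t.complete = true;
  t.incomplete_children.fetch_add(1); // block the bottom half's cleanup
}

static void second_top_half(toy_proxy &t) {
  t.incomplete_children.fetch_sub(1, std::memory_order_release);
}

static void bottom_half(toy_proxy &t) {
  // Wait until the second top half has removed the imaginary child.
  while (t.incomplete_children.load(std::memory_order_acquire) > 0) {
  }
  t.freed = true; // now it is safe to release dependences and free the task
}

int main() {
  toy_proxy t;
  first_top_half(t);
  second_top_half(t);
  bottom_half(t);
  assert(t.complete && t.freed);
  return 0;
}
#endif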
3564 
3573 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3574  KMP_DEBUG_ASSERT(ptask != NULL);
3575  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3576  KA_TRACE(
3577  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3578  gtid, taskdata));
3579 
3580  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3581 
3582  __kmp_first_top_half_finish_proxy(taskdata);
3583  __kmp_second_top_half_finish_proxy(taskdata);
3584  __kmp_bottom_half_finish_proxy(gtid, ptask);
3585 
3586  KA_TRACE(10,
3587  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3588  gtid, taskdata));
3589 }
3590 
3598 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3599  KMP_DEBUG_ASSERT(ptask != NULL);
3600  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3601 
3602  KA_TRACE(
3603  10,
3604  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3605  taskdata));
3606 
3607  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3608 
3609  __kmp_first_top_half_finish_proxy(taskdata);
3610 
3611  // Enqueue task to complete bottom half completion from a thread within the
3612  // corresponding team
3613  kmp_team_t *team = taskdata->td_team;
3614  kmp_int32 nthreads = team->t.t_nproc;
3615  kmp_info_t *thread;
3616 
3617  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3618  // but we cannot use __kmp_get_random here
3619  kmp_int32 start_k = 0;
3620  kmp_int32 pass = 1;
3621  kmp_int32 k = start_k;
3622 
3623  do {
3624  // For now we're just linearly trying to find a thread
3625  thread = team->t.t_threads[k];
3626  k = (k + 1) % nthreads;
3627 
3628  // we did a full pass through all the threads
3629  if (k == start_k)
3630  pass = pass << 1;
3631 
3632  } while (!__kmp_give_task(thread, k, ptask, pass));
3633 
3634  __kmp_second_top_half_finish_proxy(taskdata);
3635 
3636  KA_TRACE(
3637  10,
3638  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3639  taskdata));
3640 }
3641 
3642 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3643 // for taskloop
3644 //
3645 // thread: allocating thread
3646 // task_src: pointer to source task to be duplicated
3647 // returns: a pointer to the allocated kmp_task_t structure (task).
3648 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3649  kmp_task_t *task;
3650  kmp_taskdata_t *taskdata;
3651  kmp_taskdata_t *taskdata_src;
3652  kmp_taskdata_t *parent_task = thread->th.th_current_task;
3653  size_t shareds_offset;
3654  size_t task_size;
3655 
3656  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3657  task_src));
3658  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3659  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3660  TASK_FULL); // it should not be proxy task
3661  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3662  task_size = taskdata_src->td_size_alloc;
3663 
3664  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3665  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3666  task_size));
3667 #if USE_FAST_MEMORY
3668  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3669 #else
3670  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3671 #endif /* USE_FAST_MEMORY */
3672  KMP_MEMCPY(taskdata, taskdata_src, task_size);
3673 
3674  task = KMP_TASKDATA_TO_TASK(taskdata);
3675 
3676  // Initialize new task (only specific fields not affected by memcpy)
3677  taskdata->td_task_id = KMP_GEN_TASK_ID();
3678  if (task->shareds != NULL) { // need setup shareds pointer
3679  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3680  task->shareds = &((char *)taskdata)[shareds_offset];
3681  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3682  0);
3683  }
3684  taskdata->td_alloc_thread = thread;
3685  taskdata->td_parent = parent_task;
3686  taskdata->td_taskgroup =
3687  parent_task
3688  ->td_taskgroup; // task inherits the taskgroup from the parent task
3689 
3690  // Only need to keep track of child task counts if team parallel and tasking
3691  // not serialized
3692  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3693  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
3694  if (parent_task->td_taskgroup)
3695  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
3696  // Only need to keep track of allocated child tasks for explicit tasks since
3697  // implicit tasks are not deallocated
3698  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3699  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
3700  }
3701 
3702  KA_TRACE(20,
3703  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3704  thread, taskdata, taskdata->td_parent));
3705 #if OMPT_SUPPORT
3706  if (UNLIKELY(ompt_enabled.enabled))
3707  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
3708 #endif
3709  return task;
3710 }
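// Editor's note: a standalone sketch of the shareds fix-up performed above.
// After a byte-wise copy of a block that contains a pointer into itself, the
// pointer must be re-based by preserving its offset from the start of the
// block. The blob layout is made up; it is not the kmp_taskdata_t layout.
#if 0
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>

struct blob {
  char *interior; // points somewhere inside the same allocation
  char payload[64];
};

int main() {
  blob *src = (blob *)std::malloc(sizeof(blob));
  src->interior = src->payload + 16;

  blob *dst = (blob *)std::malloc(sizeof(blob));
  std::memcpy(dst, src, sizeof(blob));

  // After memcpy, dst->interior still points into *src*; re-base it by offset.
  std::size_t offset = (std::size_t)(src->interior - (char *)src);
  dst->interior = (char *)dst + offset;

  assert(dst->interior == dst->payload + 16);
  std::free(src);
  std::free(dst);
  return 0;
}
#endif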
3711 
3712 // Routine optionally generated by the compiler for setting the lastprivate flag
3713 // and calling needed constructors for private/firstprivate objects
3714 // (used to form taskloop tasks from pattern task)
3715 // Parameters: dest task, src task, lastprivate flag.
3716 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3717 
3718 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
3719 
3720 // class to encapsulate manipulating loop bounds in a taskloop task.
3721 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
3722 // the loop bound variables.
3723 class kmp_taskloop_bounds_t {
3724  kmp_task_t *task;
3725  const kmp_taskdata_t *taskdata;
3726  size_t lower_offset;
3727  size_t upper_offset;
3728 
3729 public:
3730  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
3731  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
3732  lower_offset((char *)lb - (char *)task),
3733  upper_offset((char *)ub - (char *)task) {
3734  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
3735  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
3736  }
3737  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
3738  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
3739  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
3740  size_t get_lower_offset() const { return lower_offset; }
3741  size_t get_upper_offset() const { return upper_offset; }
3742  kmp_uint64 get_lb() const {
3743  kmp_int64 retval;
3744 #if defined(KMP_GOMP_COMPAT)
3745  // Intel task just returns the lower bound normally
3746  if (!taskdata->td_flags.native) {
3747  retval = *(kmp_int64 *)((char *)task + lower_offset);
3748  } else {
3749  // GOMP task has to take into account the sizeof(long)
3750  if (taskdata->td_size_loop_bounds == 4) {
3751  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
3752  retval = (kmp_int64)*lb;
3753  } else {
3754  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
3755  retval = (kmp_int64)*lb;
3756  }
3757  }
3758 #else
3759  retval = *(kmp_int64 *)((char *)task + lower_offset);
3760 #endif // defined(KMP_GOMP_COMPAT)
3761  return retval;
3762  }
3763  kmp_uint64 get_ub() const {
3764  kmp_int64 retval;
3765 #if defined(KMP_GOMP_COMPAT)
3766  // Intel task just returns the upper bound normally
3767  if (!taskdata->td_flags.native) {
3768  retval = *(kmp_int64 *)((char *)task + upper_offset);
3769  } else {
3770  // GOMP task has to take into account the sizeof(long)
3771  if (taskdata->td_size_loop_bounds == 4) {
3772  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
3773  retval = (kmp_int64)*ub;
3774  } else {
3775  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
3776  retval = (kmp_int64)*ub;
3777  }
3778  }
3779 #else
3780  retval = *(kmp_int64 *)((char *)task + upper_offset);
3781 #endif // defined(KMP_GOMP_COMPAT)
3782  return retval;
3783  }
3784  void set_lb(kmp_uint64 lb) {
3785 #if defined(KMP_GOMP_COMPAT)
3786  // Intel task just sets the lower bound normally
3787  if (!taskdata->td_flags.native) {
3788  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
3789  } else {
3790  // GOMP task has to take into account the sizeof(long)
3791  if (taskdata->td_size_loop_bounds == 4) {
3792  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
3793  *lower = (kmp_uint32)lb;
3794  } else {
3795  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
3796  *lower = (kmp_uint64)lb;
3797  }
3798  }
3799 #else
3800  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
3801 #endif // defined(KMP_GOMP_COMPAT)
3802  }
3803  void set_ub(kmp_uint64 ub) {
3804 #if defined(KMP_GOMP_COMPAT)
3805  // Intel task just sets the upper bound normally
3806  if (!taskdata->td_flags.native) {
3807  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
3808  } else {
3809  // GOMP task has to take into account the sizeof(long)
3810  if (taskdata->td_size_loop_bounds == 4) {
3811  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
3812  *upper = (kmp_uint32)ub;
3813  } else {
3814  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
3815  *upper = (kmp_uint64)ub;
3816  }
3817  }
3818 #else
3819  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
3820 #endif // defined(KMP_GOMP_COMPAT)
3821  }
3822 };
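// Editor's note: a standalone sketch of the offset-based accessor pattern the
// class above encapsulates -- the loop bounds live inside the task block at
// offsets computed from the pointers the compiler passed in, so duplicated
// tasks can read and write their own bounds through the same offsets. The
// toy_task layout is made up and ignores the GOMP sizeof(long) wrinkle.
#if 0
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

struct toy_task {
  void *shareds;
  std::uint64_t lb, ub;
};

struct toy_bounds {
  toy_task *task;
  std::size_t lower_offset, upper_offset;
  toy_bounds(toy_task *t, std::uint64_t *lb, std::uint64_t *ub)
      : task(t), lower_offset((char *)lb - (char *)t),
        upper_offset((char *)ub - (char *)t) {}
  std::uint64_t get_lb() const {
    std::uint64_t v;
    std::memcpy(&v, (char *)task + lower_offset, sizeof(v));
    return v;
  }
  void set_ub(std::uint64_t v) {
    std::memcpy((char *)task + upper_offset, &v, sizeof(v));
  }
};

int main() {
  toy_task t{nullptr, 10, 20};
  toy_bounds b(&t, &t.lb, &t.ub);
  assert(b.get_lb() == 10);
  b.set_ub(99);
  assert(t.ub == 99);
  return 0;
}
#endif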
3823 
3824 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3825 //
3826 // loc Source location information
3827 // gtid Global thread ID
3828 // task Pattern task, exposes the loop iteration range
3829 // lb Pointer to loop lower bound in task structure
3830 // ub Pointer to loop upper bound in task structure
3831 // st Loop stride
3832 // ub_glob Global upper bound (used for lastprivate check)
3833 // num_tasks Number of tasks to execute
3834 // grainsize Number of loop iterations per task
3835 // extras Number of chunks with grainsize+1 iterations
3836 // tc Iterations count
3837 // task_dup Tasks duplication routine
3838 // codeptr_ra Return address for OMPT events
3839 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3840  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3841  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
3842  kmp_uint64 grainsize, kmp_uint64 extras,
3843  kmp_uint64 tc,
3844 #if OMPT_SUPPORT
3845  void *codeptr_ra,
3846 #endif
3847  void *task_dup) {
3848  KMP_COUNT_BLOCK(OMP_TASKLOOP);
3849  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3850  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3851  // compiler provides global bounds here
3852  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
3853  kmp_uint64 lower = task_bounds.get_lb();
3854  kmp_uint64 upper = task_bounds.get_ub();
3855  kmp_uint64 i;
3856  kmp_info_t *thread = __kmp_threads[gtid];
3857  kmp_taskdata_t *current_task = thread->th.th_current_task;
3858  kmp_task_t *next_task;
3859  kmp_int32 lastpriv = 0;
3860 
3861  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3862  KMP_DEBUG_ASSERT(num_tasks > extras);
3863  KMP_DEBUG_ASSERT(num_tasks > 0);
3864  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
3865  "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
3866  gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
3867  task_dup));
3868 
3869  // Launch num_tasks tasks, assigning grainsize iterations to each task
3870  for (i = 0; i < num_tasks; ++i) {
3871  kmp_uint64 chunk_minus_1;
3872  if (extras == 0) {
3873  chunk_minus_1 = grainsize - 1;
3874  } else {
3875  chunk_minus_1 = grainsize;
3876  --extras; // first extras iterations get bigger chunk (grainsize+1)
3877  }
3878  upper = lower + st * chunk_minus_1;
3879  if (i == num_tasks - 1) {
3880  // schedule the last task, set lastprivate flag if needed
3881  if (st == 1) { // most common case
3882  KMP_DEBUG_ASSERT(upper == *ub);
3883  if (upper == ub_glob)
3884  lastpriv = 1;
3885  } else if (st > 0) { // positive loop stride
3886  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
3887  if ((kmp_uint64)st > ub_glob - upper)
3888  lastpriv = 1;
3889  } else { // negative loop stride
3890  KMP_DEBUG_ASSERT(upper + st < *ub);
3891  if (upper - ub_glob < (kmp_uint64)(-st))
3892  lastpriv = 1;
3893  }
3894  }
3895  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3896  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
3897  kmp_taskloop_bounds_t next_task_bounds =
3898  kmp_taskloop_bounds_t(next_task, task_bounds);
3899 
3900  // adjust task-specific bounds
3901  next_task_bounds.set_lb(lower);
3902  if (next_taskdata->td_flags.native) {
3903  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
3904  } else {
3905  next_task_bounds.set_ub(upper);
3906  }
3907  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
3908  ptask_dup(next_task, task, lastpriv);
3909  KA_TRACE(40,
3910  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
3911  "upper %lld stride %lld, (offsets %p %p)\n",
3912  gtid, i, next_task, lower, upper, st,
3913  next_task_bounds.get_lower_offset(),
3914  next_task_bounds.get_upper_offset()));
3915 #if OMPT_SUPPORT
3916  __kmp_omp_taskloop_task(NULL, gtid, next_task,
3917  codeptr_ra); // schedule new task
3918 #else
3919  __kmp_omp_task(gtid, next_task, true); // schedule new task
3920 #endif
3921  lower = upper + st; // adjust lower bound for the next iteration
3922  }
3923  // free the pattern task and exit
3924  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
3925  // do not execute the pattern task, just do internal bookkeeping
3926  __kmp_task_finish<false>(gtid, task, current_task);
3927 }
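// Editor's note: a standalone sketch of the chunking rule used by
// __kmp_taskloop_linear above -- the first `extras` tasks get grainsize+1
// iterations, the rest get grainsize, and each task covers
// [lower, lower + st*(chunk-1)]. The numbers are made up for illustration.
#if 0
#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t tc = 10, num_tasks = 3, lower = 0;
  std::int64_t st = 1;
  std::uint64_t grainsize = tc / num_tasks; // 3
  std::uint64_t extras = tc % num_tasks;    // 1
  for (std::uint64_t i = 0; i < num_tasks; ++i) {
    std::uint64_t chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;            // this task gets one extra iteration
      --extras;
    }
    std::uint64_t upper = lower + st * chunk_minus_1;
    std::printf("task %llu: [%llu, %llu]\n", (unsigned long long)i,
                (unsigned long long)lower, (unsigned long long)upper);
    lower = upper + st;
  }
  // prints: task 0: [0, 3], task 1: [4, 6], task 2: [7, 9]
  return 0;
}
#endif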
3928 
3929 // Structure to keep taskloop parameters for auxiliary task
3930 // kept in the shareds of the task structure.
3931 typedef struct __taskloop_params {
3932  kmp_task_t *task;
3933  kmp_uint64 *lb;
3934  kmp_uint64 *ub;
3935  void *task_dup;
3936  kmp_int64 st;
3937  kmp_uint64 ub_glob;
3938  kmp_uint64 num_tasks;
3939  kmp_uint64 grainsize;
3940  kmp_uint64 extras;
3941  kmp_uint64 tc;
3942  kmp_uint64 num_t_min;
3943 #if OMPT_SUPPORT
3944  void *codeptr_ra;
3945 #endif
3946 } __taskloop_params_t;
3947 
3948 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
3949  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
3950  kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
3951 #if OMPT_SUPPORT
3952  void *,
3953 #endif
3954  void *);
3955 
3956 // Execute part of the taskloop submitted as a task.
3957 int __kmp_taskloop_task(int gtid, void *ptask) {
3958  __taskloop_params_t *p =
3959  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
3960  kmp_task_t *task = p->task;
3961  kmp_uint64 *lb = p->lb;
3962  kmp_uint64 *ub = p->ub;
3963  void *task_dup = p->task_dup;
3964  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3965  kmp_int64 st = p->st;
3966  kmp_uint64 ub_glob = p->ub_glob;
3967  kmp_uint64 num_tasks = p->num_tasks;
3968  kmp_uint64 grainsize = p->grainsize;
3969  kmp_uint64 extras = p->extras;
3970  kmp_uint64 tc = p->tc;
3971  kmp_uint64 num_t_min = p->num_t_min;
3972 #if OMPT_SUPPORT
3973  void *codeptr_ra = p->codeptr_ra;
3974 #endif
3975 #if KMP_DEBUG
3976  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3977  KMP_DEBUG_ASSERT(task != NULL);
3978  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
3979  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
3980  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
3981  task_dup));
3982 #endif
3983  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
3984  if (num_tasks > num_t_min)
3985  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3986  grainsize, extras, tc, num_t_min,
3987 #if OMPT_SUPPORT
3988  codeptr_ra,
3989 #endif
3990  task_dup);
3991  else
3992  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3993  grainsize, extras, tc,
3994 #if OMPT_SUPPORT
3995  codeptr_ra,
3996 #endif
3997  task_dup);
3998 
3999  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4000  return 0;
4001 }
4002 
4003 // Schedule part of the taskloop as a task,
4004 // execute the rest of the taskloop.
4005 //
4006 // loc Source location information
4007 // gtid Global thread ID
4008 // task Pattern task, exposes the loop iteration range
4009 // lb Pointer to loop lower bound in task structure
4010 // ub Pointer to loop upper bound in task structure
4011 // st Loop stride
4012 // ub_glob Global upper bound (used for lastprivate check)
4013 // num_tasks Number of tasks to execute
4014 // grainsize Number of loop iterations per task
4015 // extras Number of chunks with grainsize+1 iterations
4016 // tc Iterations count
4017 // num_t_min Threshold to launch tasks recursively
4018 // task_dup Tasks duplication routine
4019 // codeptr_ra Return address for OMPT events
4020 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4021  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4022  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4023  kmp_uint64 grainsize, kmp_uint64 extras,
4024  kmp_uint64 tc, kmp_uint64 num_t_min,
4025 #if OMPT_SUPPORT
4026  void *codeptr_ra,
4027 #endif
4028  void *task_dup) {
4029 #if KMP_DEBUG
4030  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4031  KMP_DEBUG_ASSERT(task != NULL);
4032  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4033  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4034  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4035  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4036  task_dup));
4037 #endif
4038  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4039  kmp_uint64 lower = *lb;
4040  kmp_info_t *thread = __kmp_threads[gtid];
4041  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4042  kmp_task_t *next_task;
4043  size_t lower_offset =
4044  (char *)lb - (char *)task; // remember offset of lb in the task structure
4045  size_t upper_offset =
4046  (char *)ub - (char *)task; // remember offset of ub in the task structure
4047 
4048  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4049  KMP_DEBUG_ASSERT(num_tasks > extras);
4050  KMP_DEBUG_ASSERT(num_tasks > 0);
4051 
4052  // split the loop in two halves
4053  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4054  kmp_uint64 gr_size0 = grainsize;
4055  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4056  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4057  if (n_tsk0 <= extras) {
4058  gr_size0++; // integrate extras into grainsize
4059  ext0 = 0; // no extra iters in 1st half
4060  ext1 = extras - n_tsk0; // remaining extras
4061  tc0 = gr_size0 * n_tsk0;
4062  tc1 = tc - tc0;
4063  } else { // n_tsk0 > extras
4064  ext1 = 0; // no extra iters in 2nd half
4065  ext0 = extras;
4066  tc1 = grainsize * n_tsk1;
4067  tc0 = tc - tc1;
4068  }
4069  ub0 = lower + st * (tc0 - 1);
4070  lb1 = ub0 + st;
4071 
4072  // create pattern task for 2nd half of the loop
4073  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4074  // adjust lower bound (upper bound is not changed) for the 2nd half
4075  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4076  if (ptask_dup != NULL) // construct firstprivates, etc.
4077  ptask_dup(next_task, task, 0);
4078  *ub = ub0; // adjust upper bound for the 1st half
4079 
4080  // create auxiliary task for 2nd half of the loop
4081  kmp_task_t *new_task =
4082  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4083  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4084  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4085  p->task = next_task;
4086  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4087  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4088  p->task_dup = task_dup;
4089  p->st = st;
4090  p->ub_glob = ub_glob;
4091  p->num_tasks = n_tsk1;
4092  p->grainsize = grainsize;
4093  p->extras = ext1;
4094  p->tc = tc1;
4095  p->num_t_min = num_t_min;
4096 #if OMPT_SUPPORT
4097  p->codeptr_ra = codeptr_ra;
4098 #endif
4099 
4100 #if OMPT_SUPPORT
4101  // schedule new task with correct return address for OMPT events
4102  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4103 #else
4104  __kmp_omp_task(gtid, new_task, true); // schedule new task
4105 #endif
4106 
4107  // execute the 1st half of current subrange
4108  if (n_tsk0 > num_t_min)
4109  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4110  ext0, tc0, num_t_min,
4111 #if OMPT_SUPPORT
4112  codeptr_ra,
4113 #endif
4114  task_dup);
4115  else
4116  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4117  gr_size0, ext0, tc0,
4118 #if OMPT_SUPPORT
4119  codeptr_ra,
4120 #endif
4121  task_dup);
4122 
4123  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4124 }
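// Editor's note: a standalone sketch of the split computed by
// __kmp_taskloop_recur above -- one half keeps n_tsk0 tasks, the other n_tsk1,
// and the extras are assigned so that tc0 + tc1 == tc and each half still
// satisfies tc_i == n_i * grain_i + ext_i. Values are made up for illustration.
#if 0
#include <cassert>
#include <cstdint>

int main() {
  std::uint64_t num_tasks = 7, grainsize = 5, extras = 3;
  std::uint64_t tc = num_tasks * grainsize + extras; // 38

  std::uint64_t gr_size0 = grainsize;
  std::uint64_t n_tsk0 = num_tasks >> 1;     // half executed right away
  std::uint64_t n_tsk1 = num_tasks - n_tsk0; // half deferred to an aux task
  std::uint64_t ext0, ext1, tc0, tc1;
  if (n_tsk0 <= extras) {
    gr_size0++;              // fold the extras into the first half's grainsize
    ext0 = 0;
    ext1 = extras - n_tsk0;
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else {
    ext1 = 0;
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  assert(tc0 + tc1 == tc);
  assert(tc0 == n_tsk0 * gr_size0 + ext0);
  assert(tc1 == n_tsk1 * grainsize + ext1);
  return 0;
}
#endif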
4125 
4142 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4143  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4144  int sched, kmp_uint64 grainsize, void *task_dup) {
4145  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4146  KMP_DEBUG_ASSERT(task != NULL);
4147 
4148  if (nogroup == 0) {
4149 #if OMPT_SUPPORT && OMPT_OPTIONAL
4150  OMPT_STORE_RETURN_ADDRESS(gtid);
4151 #endif
4152  __kmpc_taskgroup(loc, gtid);
4153  }
4154 
4155  // =========================================================================
4156  // calculate loop parameters
4157  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4158  kmp_uint64 tc;
4159  // compiler provides global bounds here
4160  kmp_uint64 lower = task_bounds.get_lb();
4161  kmp_uint64 upper = task_bounds.get_ub();
4162  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4163  kmp_uint64 num_tasks = 0, extras = 0;
4164  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4165  kmp_info_t *thread = __kmp_threads[gtid];
4166  kmp_taskdata_t *current_task = thread->th.th_current_task;
4167 
4168  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4169  "grain %llu(%d), dup %p\n",
4170  gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4171 
4172  // compute trip count
4173  if (st == 1) { // most common case
4174  tc = upper - lower + 1;
4175  } else if (st < 0) {
4176  tc = (lower - upper) / (-st) + 1;
4177  } else { // st > 0
4178  tc = (upper - lower) / st + 1;
4179  }
4180  if (tc == 0) {
4181  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4182  // free the pattern task and exit
4183  __kmp_task_start(gtid, task, current_task);
4184  // do not execute anything for zero-trip loop
4185  __kmp_task_finish<false>(gtid, task, current_task);
4186  return;
4187  }
4188 
4189 #if OMPT_SUPPORT && OMPT_OPTIONAL
4190  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4191  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4192  if (ompt_enabled.ompt_callback_work) {
4193  ompt_callbacks.ompt_callback(ompt_callback_work)(
4194  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4195  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4196  }
4197 #endif
4198 
4199  if (num_tasks_min == 0)
4200  // TODO: can we choose a better default heuristic?
4201  num_tasks_min =
4202  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4203 
4204  // compute num_tasks/grainsize based on the input provided
4205  switch (sched) {
4206  case 0: // no schedule clause specified, we can choose the default
4207  // let's try to schedule (team_size*10) tasks
4208  grainsize = thread->th.th_team_nproc * 10;
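// deliberate fall-through: the default value chosen above is handled below
// exactly as if a num_tasks clause had been given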
4209  case 2: // num_tasks provided
4210  if (grainsize > tc) {
4211  num_tasks = tc; // too big num_tasks requested, adjust values
4212  grainsize = 1;
4213  extras = 0;
4214  } else {
4215  num_tasks = grainsize;
4216  grainsize = tc / num_tasks;
4217  extras = tc % num_tasks;
4218  }
4219  break;
4220  case 1: // grainsize provided
4221  if (grainsize > tc) {
4222  num_tasks = 1; // too big grainsize requested, adjust values
4223  grainsize = tc;
4224  extras = 0;
4225  } else {
4226  num_tasks = tc / grainsize;
4227  // adjust grainsize for balanced distribution of iterations
4228  grainsize = tc / num_tasks;
4229  extras = tc % num_tasks;
4230  }
4231  break;
4232  default:
4233  KMP_ASSERT2(0, "unknown scheduling of taskloop");
4234  }
4235  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4236  KMP_DEBUG_ASSERT(num_tasks > extras);
4237  KMP_DEBUG_ASSERT(num_tasks > 0);
4238  // =========================================================================
4239 
4240  // check the if-clause value first
4241  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
4242  if (if_val == 0) { // if(0) specified, mark task as serial
4243  taskdata->td_flags.task_serial = 1;
4244  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4245  // always start serial tasks linearly
4246  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4247  grainsize, extras, tc,
4248 #if OMPT_SUPPORT
4249  OMPT_GET_RETURN_ADDRESS(0),
4250 #endif
4251  task_dup);
4252  // !taskdata->td_flags.native => currently force linear spawning of tasks
4253  // for GOMP_taskloop
4254  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4255  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4256  "(%lld), grain %llu, extras %llu\n",
4257  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4258  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4259  grainsize, extras, tc, num_tasks_min,
4260 #if OMPT_SUPPORT
4261  OMPT_GET_RETURN_ADDRESS(0),
4262 #endif
4263  task_dup);
4264  } else {
4265  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4266  "(%lld), grain %llu, extras %llu\n",
4267  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4268  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4269  grainsize, extras, tc,
4270 #if OMPT_SUPPORT
4271  OMPT_GET_RETURN_ADDRESS(0),
4272 #endif
4273  task_dup);
4274  }
4275 
4276 #if OMPT_SUPPORT && OMPT_OPTIONAL
4277  if (ompt_enabled.ompt_callback_work) {
4278  ompt_callbacks.ompt_callback(ompt_callback_work)(
4279  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4280  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4281  }
4282 #endif
4283 
4284  if (nogroup == 0) {
4285 #if OMPT_SUPPORT && OMPT_OPTIONAL
4286  OMPT_STORE_RETURN_ADDRESS(gtid);
4287 #endif
4288  __kmpc_end_taskgroup(loc, gtid);
4289  }
4290  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4291 }
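// Editor's note: a standalone sketch of the bookkeeping __kmpc_taskloop does
// above -- trip count for an arbitrary stride, then num_tasks/grainsize/extras
// derived either from a grainsize clause or from a num_tasks clause. The
// helper names are made up for illustration.
#if 0
#include <cassert>
#include <cstdint>

struct split {
  std::uint64_t num_tasks, grainsize, extras;
};

static std::uint64_t trip_count(std::uint64_t lb, std::uint64_t ub,
                                std::int64_t st) {
  if (st == 1)
    return ub - lb + 1;
  if (st < 0)
    return (lb - ub) / (std::uint64_t)(-st) + 1;
  return (ub - lb) / (std::uint64_t)st + 1;
}

static split from_grainsize(std::uint64_t tc, std::uint64_t grainsize) {
  if (grainsize > tc)
    return {1, tc, 0};                       // one task takes everything
  std::uint64_t num_tasks = tc / grainsize;
  return {num_tasks, tc / num_tasks, tc % num_tasks};
}

static split from_num_tasks(std::uint64_t tc, std::uint64_t num_tasks) {
  if (num_tasks > tc)
    return {tc, 1, 0};                       // at most one iteration per task
  return {num_tasks, tc / num_tasks, tc % num_tasks};
}

int main() {
  std::uint64_t tc = trip_count(0, 98, 2); // iterations 0,2,...,98 -> 50
  assert(tc == 50);
  split g = from_grainsize(tc, 7);
  assert(g.num_tasks * g.grainsize + g.extras == tc);
  split n = from_num_tasks(tc, 8);
  assert(n.num_tasks * n.grainsize + n.extras == tc);
  return 0;
}
#endif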
4292 
4293 #endif