LLVM OpenMP* Runtime Library
kmp_tasking.c
1 /*
2  * kmp_tasking.c -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_wait_release.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 
27 /* ------------------------------------------------------------------------ */
28 /* ------------------------------------------------------------------------ */
29 
30 
31 /* forward declaration */
32 static void __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr );
33 static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data );
34 static int __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team );
35 
36 #ifdef OMP_41_ENABLED
37 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
38 #endif
39 
40 static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
41  if (!flag) return;
42  // Attempt to wake up a thread: examine its type and call appropriate template
43  switch (((kmp_flag_64 *)flag)->get_type()) {
44  case flag32: __kmp_resume_32(gtid, NULL); break;
45  case flag64: __kmp_resume_64(gtid, NULL); break;
46  case flag_oncore: __kmp_resume_oncore(gtid, NULL); break;
47  }
48 }
49 
50 #ifdef BUILD_TIED_TASK_STACK
51 
52 //---------------------------------------------------------------------------
53 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
54 // from top to bottom
55 //
56 // gtid: global thread identifier for thread containing stack
57 // thread_data: thread data for task team thread containing stack
58 // threshold: value above which the trace statement triggers
59 // location: string identifying call site of this function (for trace)
60 
61 static void
62 __kmp_trace_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data, int threshold, char *location )
63 {
64  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
65  kmp_taskdata_t **stack_top = task_stack -> ts_top;
66  kmp_int32 entries = task_stack -> ts_entries;
67  kmp_taskdata_t *tied_task;
68 
69  KA_TRACE(threshold, ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
70  "first_block = %p, stack_top = %p \n",
71  location, gtid, entries, task_stack->ts_first_block, stack_top ) );
72 
73  KMP_DEBUG_ASSERT( stack_top != NULL );
74  KMP_DEBUG_ASSERT( entries > 0 );
75 
76  while ( entries != 0 )
77  {
78  KMP_DEBUG_ASSERT( stack_top != & task_stack->ts_first_block.sb_block[0] );
79  // fix up ts_top if we need to pop from previous block
80  if ( ( entries & TASK_STACK_INDEX_MASK ) == 0 )
81  {
82  kmp_stack_block_t *stack_block = (kmp_stack_block_t *) (stack_top) ;
83 
84  stack_block = stack_block -> sb_prev;
85  stack_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
86  }
87 
88  // finish bookkeeping
89  stack_top--;
90  entries--;
91 
92  tied_task = * stack_top;
93 
94  KMP_DEBUG_ASSERT( tied_task != NULL );
95  KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
96 
97  KA_TRACE(threshold, ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
98  "stack_top=%p, tied_task=%p\n",
99  location, gtid, entries, stack_top, tied_task ) );
100  }
101  KMP_DEBUG_ASSERT( stack_top == & task_stack->ts_first_block.sb_block[0] );
102 
103  KA_TRACE(threshold, ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
104  location, gtid ) );
105 }
106 
107 //---------------------------------------------------------------------------
108 // __kmp_init_task_stack: initialize the task stack for the first time
109 // after a thread_data structure is created.
110 // It should not be necessary to do this again (assuming the stack works).
111 //
112 // gtid: global thread identifier of calling thread
113 // thread_data: thread data for task team thread containing stack
114 
115 static void
116 __kmp_init_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
117 {
118  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
119  kmp_stack_block_t *first_block;
120 
121  // set up the first block of the stack
122  first_block = & task_stack -> ts_first_block;
123  task_stack -> ts_top = (kmp_taskdata_t **) first_block;
124  memset( (void *) first_block, '\0', TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
125 
126  // initialize the stack to be empty
127  task_stack -> ts_entries = TASK_STACK_EMPTY;
128  first_block -> sb_next = NULL;
129  first_block -> sb_prev = NULL;
130 }
131 
132 
133 //---------------------------------------------------------------------------
134 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
135 //
136 // gtid: global thread identifier for calling thread
137 // thread_data: thread info for thread containing stack
138 
139 static void
140 __kmp_free_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
141 {
142  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
143  kmp_stack_block_t *stack_block = & task_stack -> ts_first_block;
144 
145  KMP_DEBUG_ASSERT( task_stack -> ts_entries == TASK_STACK_EMPTY );
146  // free from the second block of the stack
147  while ( stack_block != NULL ) {
148  kmp_stack_block_t *next_block = (stack_block) ? stack_block -> sb_next : NULL;
149 
150  stack_block -> sb_next = NULL;
151  stack_block -> sb_prev = NULL;
152  if (stack_block != & task_stack -> ts_first_block) {
153  __kmp_thread_free( __kmp_threads[ gtid ], stack_block ); // free the block, if not the first
154  }
155  stack_block = next_block;
156  }
157  // initialize the stack to be empty
158  task_stack -> ts_entries = 0;
159  task_stack -> ts_top = NULL;
160 }
161 
162 
163 //---------------------------------------------------------------------------
164 // __kmp_push_task_stack: Push the tied task onto the task stack.
165 // Grow the stack if necessary by allocating another block.
166 //
167 // gtid: global thread identifier for calling thread
168 // thread: thread info for thread containing stack
169 // tied_task: the task to push on the stack
170 
171 static void
172 __kmp_push_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t * tied_task )
173 {
174  // GEH - need to consider what to do if tt_threads_data not allocated yet
175  kmp_thread_data_t *thread_data = & thread -> th.th_task_team ->
176  tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
177  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
178 
179  if ( tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser ) {
180  return; // Don't push anything on stack if team or team tasks are serialized
181  }
182 
183  KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
184  KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
185 
186  KA_TRACE(20, ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
187  gtid, thread, tied_task ) );
188  // Store entry
189  * (task_stack -> ts_top) = tied_task;
190 
191  // Do bookkeeping for next push
192  task_stack -> ts_top++;
193  task_stack -> ts_entries++;
194 
195  if ( ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK ) == 0 )
196  {
197  // Find beginning of this task block
198  kmp_stack_block_t *stack_block =
199  (kmp_stack_block_t *) (task_stack -> ts_top - TASK_STACK_BLOCK_SIZE);
200 
201  // Check if we already have a block
202  if ( stack_block -> sb_next != NULL )
203  { // reset ts_top to beginning of next block
204  task_stack -> ts_top = & stack_block -> sb_next -> sb_block[0];
205  }
206  else
207  { // Alloc new block and link it up
208  kmp_stack_block_t *new_block = (kmp_stack_block_t *)
209  __kmp_thread_calloc(thread, sizeof(kmp_stack_block_t));
210 
211  task_stack -> ts_top = & new_block -> sb_block[0];
212  stack_block -> sb_next = new_block;
213  new_block -> sb_prev = stack_block;
214  new_block -> sb_next = NULL;
215 
216  KA_TRACE(30, ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
217  gtid, tied_task, new_block ) );
218  }
219  }
220  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
221 }
222 
223 //---------------------------------------------------------------------------
224 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
225 // the task, just check to make sure it matches the ending task passed in.
226 //
227 // gtid: global thread identifier for the calling thread
228 // thread: thread info structure containing stack
229 // tied_task: the task popped off the stack
230 // ending_task: the task that is ending (should match popped task)
231 
232 static void
233 __kmp_pop_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t *ending_task )
234 {
235  // GEH - need to consider what to do if tt_threads_data not allocated yet
236  kmp_thread_data_t *thread_data = & thread -> th.th_task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
237  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
238  kmp_taskdata_t *tied_task;
239 
240  if ( ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser ) {
241  return; // Don't pop anything from stack if team or team tasks are serialized
242  }
243 
244  KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
245  KMP_DEBUG_ASSERT( task_stack -> ts_entries > 0 );
246 
247  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, thread ) );
248 
249  // fix up ts_top if we need to pop from previous block
250  if ( ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK ) == 0 )
251  {
252  kmp_stack_block_t *stack_block =
253  (kmp_stack_block_t *) (task_stack -> ts_top) ;
254 
255  stack_block = stack_block -> sb_prev;
256  task_stack -> ts_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
257  }
258 
259  // finish bookkeeping
260  task_stack -> ts_top--;
261  task_stack -> ts_entries--;
262 
263  tied_task = * (task_stack -> ts_top );
264 
265  KMP_DEBUG_ASSERT( tied_task != NULL );
266  KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
267  KMP_DEBUG_ASSERT( tied_task == ending_task ); // If we built the stack correctly
268 
269  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
270  return;
271 }
272 #endif /* BUILD_TIED_TASK_STACK */
273 
274 //---------------------------------------------------
275 // __kmp_push_task: Add a task to the thread's deque
276 
277 static kmp_int32
278 __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
279 {
280  kmp_info_t * thread = __kmp_threads[ gtid ];
281  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
282  kmp_task_team_t * task_team = thread->th.th_task_team;
283  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
284  kmp_thread_data_t * thread_data;
285 
286  KA_TRACE(20, ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata ) );
287 
288  // The first check avoids building task_team thread data if serialized
289  if ( taskdata->td_flags.task_serial ) {
290  KA_TRACE(20, ( "__kmp_push_task: T#%d team serialized; returning TASK_NOT_PUSHED for task %p\n",
291  gtid, taskdata ) );
292  return TASK_NOT_PUSHED;
293  }
294 
295  // Now that serialized tasks have returned, we can assume that we are not in immediate exec mode
296  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
297  if ( ! KMP_TASKING_ENABLED(task_team) ) {
298  __kmp_enable_tasking( task_team, thread );
299  }
300  KMP_DEBUG_ASSERT( TCR_4(task_team -> tt.tt_found_tasks) == TRUE );
301  KMP_DEBUG_ASSERT( TCR_PTR(task_team -> tt.tt_threads_data) != NULL );
302 
303  // Find tasking deque specific to encountering thread
304  thread_data = & task_team -> tt.tt_threads_data[ tid ];
305 
306  // No lock needed since only owner can allocate
307  if (thread_data -> td.td_deque == NULL ) {
308  __kmp_alloc_task_deque( thread, thread_data );
309  }
310 
311  // Check if deque is full
312  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
313  {
314  KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
315  gtid, taskdata ) );
316  return TASK_NOT_PUSHED;
317  }
318 
319  // Lock the deque for the task push operation
320  __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
321 
322 #if OMP_41_ENABLED
323  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
324  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
325  {
326  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
327  KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
328  gtid, taskdata ) );
329  return TASK_NOT_PUSHED;
330  }
331 #else
332  // Must have room, since no thread other than the calling thread can add tasks
333  KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE );
334 #endif
335 
336  thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata; // Push taskdata
337  // Wrap index.
338  thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
339  TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1); // Adjust task count
340 
341  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
342 
343  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
344  "task=%p ntasks=%d head=%u tail=%u\n",
345  gtid, taskdata, thread_data->td.td_deque_ntasks,
346  thread_data->td.td_deque_tail, thread_data->td.td_deque_head) );
347 
348  return TASK_SUCCESSFULLY_PUSHED;
349 }
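// A minimal standalone sketch of the ring-buffer push used above: the deque is a
// fixed-size array, the tail index wraps with a power-of-two mask, and a full deque
// makes the caller execute the task itself (the TASK_NOT_PUSHED path). The names
// (ex_deque_*) and the capacity are hypothetical, not the runtime's own.
#if 0
enum { EX_DEQUE_SIZE = 256, EX_DEQUE_MASK = EX_DEQUE_SIZE - 1 };

typedef struct ex_deque {
    void      *slots[ EX_DEQUE_SIZE ];
    kmp_uint32 head;        // steals pop from here
    kmp_uint32 tail;        // the owning thread pushes here
    kmp_int32  ntasks;      // current occupancy
} ex_deque_t;

static int
ex_deque_push( ex_deque_t *d, void *task )
{
    if ( d->ntasks >= EX_DEQUE_SIZE )
        return 0;                                   // full: caller runs the task immediately
    d->slots[ d->tail ] = task;                     // store entry
    d->tail = ( d->tail + 1 ) & EX_DEQUE_MASK;      // wrap the tail index
    d->ntasks++;                                    // adjust task count
    return 1;
}
#endif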
350 
351 
352 //-----------------------------------------------------------------------------------------
353 // __kmp_pop_current_task_from_thread: set up current task from called thread when team ends
354 // this_thr: thread structure to set current_task in.
355 
356 void
357 __kmp_pop_current_task_from_thread( kmp_info_t *this_thr )
358 {
359  KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(enter): T#%d this_thread=%p, curtask=%p, "
360  "curtask_parent=%p\n",
361  0, this_thr, this_thr -> th.th_current_task,
362  this_thr -> th.th_current_task -> td_parent ) );
363 
364  this_thr -> th.th_current_task = this_thr -> th.th_current_task -> td_parent;
365 
366  KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(exit): T#%d this_thread=%p, curtask=%p, "
367  "curtask_parent=%p\n",
368  0, this_thr, this_thr -> th.th_current_task,
369  this_thr -> th.th_current_task -> td_parent ) );
370 }
371 
372 
373 //---------------------------------------------------------------------------------------
374 // __kmp_push_current_task_to_thread: set up current task in called thread for a new team
375 // this_thr: thread structure to set up
376 // team: team for implicit task data
377 // tid: thread within team to set up
378 
379 void
380 __kmp_push_current_task_to_thread( kmp_info_t *this_thr, kmp_team_t *team, int tid )
381 {
382  // current task of the thread is a parent of the new just created implicit tasks of new team
383  KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p curtask=%p "
384  "parent_task=%p\n",
385  tid, this_thr, this_thr->th.th_current_task,
386  team->t.t_implicit_task_taskdata[tid].td_parent ) );
387 
388  KMP_DEBUG_ASSERT (this_thr != NULL);
389 
390  if( tid == 0 ) {
391  if( this_thr->th.th_current_task != & team -> t.t_implicit_task_taskdata[ 0 ] ) {
392  team -> t.t_implicit_task_taskdata[ 0 ].td_parent = this_thr->th.th_current_task;
393  this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ 0 ];
394  }
395  } else {
396  team -> t.t_implicit_task_taskdata[ tid ].td_parent = team -> t.t_implicit_task_taskdata[ 0 ].td_parent;
397  this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ tid ];
398  }
399 
400  KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p curtask=%p "
401  "parent_task=%p\n",
402  tid, this_thr, this_thr->th.th_current_task,
403  team->t.t_implicit_task_taskdata[tid].td_parent ) );
404 }
405 
406 
407 //----------------------------------------------------------------------
408 // __kmp_task_start: bookkeeping for a task starting execution
409 // GTID: global thread id of calling thread
410 // task: task starting execution
411 // current_task: task suspending
412 
413 static void
414 __kmp_task_start( kmp_int32 gtid, kmp_task_t * task, kmp_taskdata_t * current_task )
415 {
416  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
417  kmp_info_t * thread = __kmp_threads[ gtid ];
418 
419  KA_TRACE(10, ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
420  gtid, taskdata, current_task) );
421 
422  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
423 
424  // mark currently executing task as suspended
425  // TODO: GEH - make sure root team implicit task is initialized properly.
426  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
427  current_task -> td_flags.executing = 0;
428 
429  // Add task to stack if tied
430 #ifdef BUILD_TIED_TASK_STACK
431  if ( taskdata -> td_flags.tiedness == TASK_TIED )
432  {
433  __kmp_push_task_stack( gtid, thread, taskdata );
434  }
435 #endif /* BUILD_TIED_TASK_STACK */
436 
437  // mark starting task as executing and as current task
438  thread -> th.th_current_task = taskdata;
439 
440  KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 0 );
441  KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 0 );
442  taskdata -> td_flags.started = 1;
443  taskdata -> td_flags.executing = 1;
444  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
445  KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
446 
447  // GEH TODO: shouldn't we pass some sort of location identifier here?
448  // APT: yes, we will pass location here.
449  // need to store current thread state (in a thread or taskdata structure)
450  // before setting work_state, otherwise wrong state is set after end of task
451 
452  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n",
453  gtid, taskdata ) );
454 
455 #if OMPT_SUPPORT
456  if (ompt_enabled &&
457  ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
458  kmp_taskdata_t *parent = taskdata->td_parent;
459  ompt_callbacks.ompt_callback(ompt_event_task_begin)(
460  parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
461  parent ? &(parent->ompt_task_info.frame) : NULL,
462  taskdata->ompt_task_info.task_id,
463  taskdata->ompt_task_info.function);
464  }
465 #endif
466 
467  return;
468 }
469 
470 
471 //----------------------------------------------------------------------
472 // __kmpc_omp_task_begin_if0: report that a given serialized task has started execution
473 // loc_ref: source location information; points to beginning of task block.
474 // gtid: global thread number.
475 // task: task thunk for the started task.
476 
477 void
478 __kmpc_omp_task_begin_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
479 {
480  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
481  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
482 
483  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p current_task=%p\n",
484  gtid, loc_ref, taskdata, current_task ) );
485 
486  taskdata -> td_flags.task_serial = 1; // Execute this task immediately, not deferred.
487  __kmp_task_start( gtid, task, current_task );
488 
489  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n",
490  gtid, loc_ref, taskdata ) );
491 
492  return;
493 }
494 
495 #ifdef TASK_UNUSED
496 //----------------------------------------------------------------------
497 // __kmpc_omp_task_begin: report that a given task has started execution
498 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
499 
500 void
501 __kmpc_omp_task_begin( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
502 {
503  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
504 
505  KA_TRACE(10, ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
506  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task ) );
507 
508  __kmp_task_start( gtid, task, current_task );
509 
510  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n",
511  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
512 
513  return;
514 }
515 #endif // TASK_UNUSED
516 
517 
518 //-------------------------------------------------------------------------------------
519 // __kmp_free_task: free the current task space and the space for shareds
520 // gtid: Global thread ID of calling thread
521 // taskdata: task to free
522 // thread: thread data structure of caller
523 
524 static void
525 __kmp_free_task( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
526 {
527  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n",
528  gtid, taskdata) );
529 
530  // Check to make sure all flags and counters have the correct values
531  KMP_DEBUG_ASSERT( taskdata->td_flags.tasktype == TASK_EXPLICIT );
532  KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 );
533  KMP_DEBUG_ASSERT( taskdata->td_flags.complete == 1 );
534  KMP_DEBUG_ASSERT( taskdata->td_flags.freed == 0 );
535  KMP_DEBUG_ASSERT( TCR_4(taskdata->td_allocated_child_tasks) == 0 || taskdata->td_flags.task_serial == 1);
536  KMP_DEBUG_ASSERT( TCR_4(taskdata->td_incomplete_child_tasks) == 0 );
537 
538  taskdata->td_flags.freed = 1;
539  // deallocate the taskdata and shared variable blocks associated with this task
540  #if USE_FAST_MEMORY
541  __kmp_fast_free( thread, taskdata );
542  #else /* ! USE_FAST_MEMORY */
543  __kmp_thread_free( thread, taskdata );
544  #endif
545 
546  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n",
547  gtid, taskdata) );
548 }
549 
550 //-------------------------------------------------------------------------------------
551 // __kmp_free_task_and_ancestors: free the current task and ancestors without children
552 //
553 // gtid: Global thread ID of calling thread
554 // taskdata: task to free
555 // thread: thread data structure of caller
556 
557 static void
558 __kmp_free_task_and_ancestors( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
559 {
560  kmp_int32 children = 0;
561  kmp_int32 team_or_tasking_serialized = taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser;
562 
563  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
564 
565  if ( !team_or_tasking_serialized ) {
566  children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
567  KMP_DEBUG_ASSERT( children >= 0 );
568  }
569 
570  // Now, go up the ancestor tree to see if any ancestors can now be freed.
571  while ( children == 0 )
572  {
573  kmp_taskdata_t * parent_taskdata = taskdata -> td_parent;
574 
575  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
576  "and freeing itself\n", gtid, taskdata) );
577 
578  // --- Deallocate my ancestor task ---
579  __kmp_free_task( gtid, taskdata, thread );
580 
581  taskdata = parent_taskdata;
582 
583  // Stop checking ancestors at implicit task or if tasking serialized
584  // instead of walking up ancestor tree to avoid premature deallocation of ancestors.
585  if ( team_or_tasking_serialized || taskdata -> td_flags.tasktype == TASK_IMPLICIT )
586  return;
587 
588  if ( !team_or_tasking_serialized ) {
589  // Predecrement simulated by "- 1" calculation
590  children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
591  KMP_DEBUG_ASSERT( children >= 0 );
592  }
593  }
594 
595  KA_TRACE(20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
596  "not freeing it yet\n", gtid, taskdata, children) );
597 }
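// A stripped-down sketch of the reference-count walk performed above:
// td_allocated_child_tasks starts at 1 (the task itself) and gains one per child
// allocated under the task, so a task can be freed only when that count reaches
// zero, and the decrement then propagates to its parent. The ex_* names are
// hypothetical, not the runtime's own.
#if 0
typedef struct ex_node {
    struct ex_node *parent;
    kmp_int32       refs;    // 1 for the node itself + one per live child
} ex_node_t;

static void
ex_release( ex_node_t *node )
{
    // KMP_TEST_THEN_DEC32 returns the old value, so "- 1" yields the new count.
    while ( node != NULL &&
            ( KMP_TEST_THEN_DEC32( (kmp_int32 *)&node->refs ) - 1 ) == 0 ) {
        ex_node_t *parent = node->parent;
        /* the real code calls __kmp_free_task() here */
        node = parent;
    }
}
#endif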
598 
599 //---------------------------------------------------------------------
600 // __kmp_task_finish: bookkeeping to do when a task finishes execution
601 // gtid: global thread ID for calling thread
602 // task: task to be finished
603 // resumed_task: task to be resumed. (may be NULL if task is serialized)
604 
605 static void
606 __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task )
607 {
608  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
609  kmp_info_t * thread = __kmp_threads[ gtid ];
610  kmp_int32 children = 0;
611 
612 #if OMPT_SUPPORT
613  if (ompt_enabled &&
614  ompt_callbacks.ompt_callback(ompt_event_task_end)) {
615  kmp_taskdata_t *parent = taskdata->td_parent;
616  ompt_callbacks.ompt_callback(ompt_event_task_end)(
617  taskdata->ompt_task_info.task_id);
618  }
619 #endif
620 
621  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming task %p\n",
622  gtid, taskdata, resumed_task) );
623 
624  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
625 
626  // Pop task from stack if tied
627 #ifdef BUILD_TIED_TASK_STACK
628  if ( taskdata -> td_flags.tiedness == TASK_TIED )
629  {
630  __kmp_pop_task_stack( gtid, thread, taskdata );
631  }
632 #endif /* BUILD_TIED_TASK_STACK */
633 
634  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
635  taskdata -> td_flags.complete = 1; // mark the task as completed
636  KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 1 );
637  KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
638 
639  // Only need to keep track of count if team parallel and tasking not serialized
640  if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
641  // Predecrement simulated by "- 1" calculation
642  children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
643  KMP_DEBUG_ASSERT( children >= 0 );
644 #if OMP_40_ENABLED
645  if ( taskdata->td_taskgroup )
646  KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
647  __kmp_release_deps(gtid,taskdata);
648 #endif
649  }
650 
651  // td_flags.executing must be marked as 0 after __kmp_release_deps has been called
652  // Otherwise, if a task is executed immediately from the release_deps code
653  // the flag will be reset to 1 again by this same function
654  KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 1 );
655  taskdata -> td_flags.executing = 0; // suspend the finishing task
656 
657  KA_TRACE(20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
658  gtid, taskdata, children) );
659 
660 #if OMP_40_ENABLED
661  /* If the tasks' destructor thunk flag has been set, we need to invoke the
662  destructor thunk that has been generated by the compiler.
663  The code is placed here, since at this point other tasks might have been released
664  hence overlapping the destructor invocations with some other work in the
665  released tasks. The OpenMP spec is not specific on when the destructors are
666  invoked, so we should be free to choose.
667  */
668  if (taskdata->td_flags.destructors_thunk) {
669  kmp_routine_entry_t destr_thunk = task->destructors;
670  KMP_ASSERT(destr_thunk);
671  destr_thunk(gtid, task);
672  }
673 #endif // OMP_40_ENABLED
674 
675  // bookkeeping for resuming task:
676  // GEH - note tasking_ser => task_serial
677  KMP_DEBUG_ASSERT( (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
678  taskdata->td_flags.task_serial);
679  if ( taskdata->td_flags.task_serial )
680  {
681  if (resumed_task == NULL) {
682  resumed_task = taskdata->td_parent; // In a serialized task, the resumed task is the parent
683  }
684  else {
685  // verify resumed task passed in points to parent
686  KMP_DEBUG_ASSERT( resumed_task == taskdata->td_parent );
687  }
688  }
689  else {
690  KMP_DEBUG_ASSERT( resumed_task != NULL ); // verify that resumed task is passed as argument
691  }
692 
693  // Free this task and then ancestor tasks if they have no children.
694  // Restore th_current_task first as suggested by John:
695  // johnmc: if an asynchronous inquiry peers into the runtime system
696  // it doesn't see the freed task as the current task.
697  thread->th.th_current_task = resumed_task;
698  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
699 
700  // TODO: GEH - make sure root team implicit task is initialized properly.
701  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
702  resumed_task->td_flags.executing = 1; // resume previous task
703 
704  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
705  gtid, taskdata, resumed_task) );
706 
707  return;
708 }
709 
710 //---------------------------------------------------------------------
711 // __kmpc_omp_task_complete_if0: report that a task has completed execution
712 // loc_ref: source location information; points to end of task block.
713 // gtid: global thread number.
714 // task: task thunk for the completed task.
715 
716 void
717 __kmpc_omp_task_complete_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
718 {
719  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
720  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
721 
722  __kmp_task_finish( gtid, task, NULL ); // this routine will provide task to resume
723 
724  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
725  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
726 
727  return;
728 }
729 
730 #ifdef TASK_UNUSED
731 //---------------------------------------------------------------------
732 // __kmpc_omp_task_complete: report that a task has completed execution
733 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
734 
735 void
736 __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
737 {
738  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n",
739  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
740 
741  __kmp_task_finish( gtid, task, NULL ); // Not sure how to find task to resume
742 
743  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n",
744  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
745  return;
746 }
747 #endif // TASK_UNUSED
748 
749 
750 #if OMPT_SUPPORT
751 //----------------------------------------------------------------------------------------------------
752 // __kmp_task_init_ompt:
753 // Initialize OMPT fields maintained by a task. This will only be called after
754 // ompt_tool, so we already know whether ompt is enabled or not.
755 
756 static inline void
757 __kmp_task_init_ompt( kmp_taskdata_t * task, int tid, void * function )
758 {
759  if (ompt_enabled) {
760  task->ompt_task_info.task_id = __ompt_task_id_new(tid);
761  task->ompt_task_info.function = function;
762  task->ompt_task_info.frame.exit_runtime_frame = NULL;
763  task->ompt_task_info.frame.reenter_runtime_frame = NULL;
764  }
765 }
766 #endif
767 
768 
769 //----------------------------------------------------------------------------------------------------
770 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit task for a given thread
771 //
772 // loc_ref: reference to source location of parallel region
773 // this_thr: thread data structure corresponding to implicit task
774 // team: team for this_thr
775 // tid: thread id of given thread within team
776 // set_curr_task: TRUE if need to push current task to thread
777 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to have already been done elsewhere.
778 // TODO: Get better loc_ref. Value passed in may be NULL
779 
780 void
781 __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task )
782 {
783  kmp_taskdata_t * task = & team->t.t_implicit_task_taskdata[ tid ];
784 
785  KF_TRACE(10, ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
786  tid, team, task, set_curr_task ? "TRUE" : "FALSE" ) );
787 
788  task->td_task_id = KMP_GEN_TASK_ID();
789  task->td_team = team;
790 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info in debugger)
791  task->td_ident = loc_ref;
792  task->td_taskwait_ident = NULL;
793  task->td_taskwait_counter = 0;
794  task->td_taskwait_thread = 0;
795 
796  task->td_flags.tiedness = TASK_TIED;
797  task->td_flags.tasktype = TASK_IMPLICIT;
798 #if OMP_41_ENABLED
799  task->td_flags.proxy = TASK_FULL;
800 #endif
801 
802  // All implicit tasks are executed immediately, not deferred
803  task->td_flags.task_serial = 1;
804  task->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
805  task->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
806 
807  task->td_flags.started = 1;
808  task->td_flags.executing = 1;
809  task->td_flags.complete = 0;
810  task->td_flags.freed = 0;
811 
812 #if OMP_40_ENABLED
813  task->td_dephash = NULL;
814  task->td_depnode = NULL;
815 #endif
816 
817  if (set_curr_task) { // only do this initialization the first time a thread is created
818  task->td_incomplete_child_tasks = 0;
819  task->td_allocated_child_tasks = 0; // Not used because do not need to deallocate implicit task
820 #if OMP_40_ENABLED
821  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
822 #endif
823  __kmp_push_current_task_to_thread( this_thr, team, tid );
824  } else {
825  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
826  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
827  }
828 
829 #if OMPT_SUPPORT
830  __kmp_task_init_ompt(task, tid, NULL);
831 #endif
832 
833  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n",
834  tid, team, task ) );
835 }
836 
837 // Round up a size to a multiple of val, where val is a power of two
838 // Used to insert padding between structures co-allocated using a single malloc() call
839 static size_t
840 __kmp_round_up_to_val( size_t size, size_t val ) {
841  if ( size & ( val - 1 ) ) {
842  size &= ~ ( val - 1 );
843  if ( size <= KMP_SIZE_T_MAX - val ) {
844  size += val; // Round up if there is no overflow.
845  }; // if
846  }; // if
847  return size;
848 } // __kmp_round_up_to_val
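// A short worked example of the rounding above, with illustrative values: 40 is
// already a multiple of 8 and comes back unchanged, while 45 has low bits set, is
// masked down to 40, and is then bumped by val to 48.
#if 0
static void
ex_round_up_examples( void )
{
    KMP_DEBUG_ASSERT( __kmp_round_up_to_val( 40, 8 ) == 40 );
    KMP_DEBUG_ASSERT( __kmp_round_up_to_val( 45, 8 ) == 48 );
}
#endif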
849 
850 
851 //---------------------------------------------------------------------------------
852 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
853 //
854 // loc_ref: source location information
855 // gtid: global thread number.
856 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' task encountered.
857 // Converted from kmp_int32 to kmp_tasking_flags_t in routine.
858 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including private vars accessed in task.
859 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed in task.
860 // task_entry: Pointer to task code entry point generated by compiler.
861 // returns: a pointer to the allocated kmp_task_t structure (task).
862 
863 kmp_task_t *
864 __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
865  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
866  kmp_routine_entry_t task_entry )
867 {
868  kmp_task_t *task;
869  kmp_taskdata_t *taskdata;
870  kmp_info_t *thread = __kmp_threads[ gtid ];
871  kmp_team_t *team = thread->th.th_team;
872  kmp_taskdata_t *parent_task = thread->th.th_current_task;
873  size_t shareds_offset;
874 
875  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
876  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
877  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
878  sizeof_shareds, task_entry) );
879 
880  if ( parent_task->td_flags.final ) {
881  if (flags->merged_if0) {
882  }
883  flags->final = 1;
884  }
885 
886 #if OMP_41_ENABLED
887  if ( flags->proxy == TASK_PROXY ) {
888  flags->tiedness = TASK_UNTIED;
889  flags->merged_if0 = 1;
890 
891  /* Running in a serialized parallel region or in tskm_immediate_exec mode: either way we need tasking support enabled */
892  if ( (thread->th.th_task_team) == NULL ) {
893  /* This should only happen if the team is serialized
894  setup a task team and propagate it to the thread
895  */
896  KMP_DEBUG_ASSERT(team->t.t_serialized);
897  KA_TRACE(30,("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid));
898  __kmp_task_team_setup(thread,team,1); // 1 indicates setup the current team regardless of nthreads
899  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
900  }
901  kmp_task_team_t * task_team = thread->th.th_task_team;
902 
903  /* tasking must be enabled now as the task might not be pushed */
904  if ( !KMP_TASKING_ENABLED( task_team ) ) {
905  KA_TRACE(30,("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
906  __kmp_enable_tasking( task_team, thread );
907  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
908  kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
909  // No lock needed since only owner can allocate
910  if (thread_data -> td.td_deque == NULL ) {
911  __kmp_alloc_task_deque( thread, thread_data );
912  }
913  }
914 
915  if ( task_team->tt.tt_found_proxy_tasks == FALSE )
916  TCW_4(task_team -> tt.tt_found_proxy_tasks, TRUE);
917  }
918 #endif
919 
920  // Calculate shared structure offset including padding after kmp_task_t struct
921  // to align pointers in shared struct
922  shareds_offset = sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t;
923  shareds_offset = __kmp_round_up_to_val( shareds_offset, sizeof( void * ));
924 
925  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
926  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n",
927  gtid, shareds_offset) );
928  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n",
929  gtid, sizeof_shareds) );
930 
931  // Avoid double allocation here by combining shareds with taskdata
932  #if USE_FAST_MEMORY
933  taskdata = (kmp_taskdata_t *) __kmp_fast_allocate( thread, shareds_offset + sizeof_shareds );
934  #else /* ! USE_FAST_MEMORY */
935  taskdata = (kmp_taskdata_t *) __kmp_thread_malloc( thread, shareds_offset + sizeof_shareds );
936  #endif /* USE_FAST_MEMORY */
937 
938  task = KMP_TASKDATA_TO_TASK(taskdata);
939 
940  // Make sure task & taskdata are aligned appropriately
941 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
942  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(double)-1) ) == 0 );
943  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(double)-1) ) == 0 );
944 #else
945  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(_Quad)-1) ) == 0 );
946  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(_Quad)-1) ) == 0 );
947 #endif
948  if (sizeof_shareds > 0) {
949  // Avoid double allocation here by combining shareds with taskdata
950  task->shareds = & ((char *) taskdata)[ shareds_offset ];
951  // Make sure shareds struct is aligned to pointer size
952  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task->shareds) & (sizeof(void *)-1) ) == 0 );
953  } else {
954  task->shareds = NULL;
955  }
956  task->routine = task_entry;
957  task->part_id = 0; // AC: Always start with 0 part id
958 
959  taskdata->td_task_id = KMP_GEN_TASK_ID();
960  taskdata->td_team = team;
961  taskdata->td_alloc_thread = thread;
962  taskdata->td_parent = parent_task;
963  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
964  taskdata->td_ident = loc_ref;
965  taskdata->td_taskwait_ident = NULL;
966  taskdata->td_taskwait_counter = 0;
967  taskdata->td_taskwait_thread = 0;
968  KMP_DEBUG_ASSERT( taskdata->td_parent != NULL );
969 #if OMP_41_ENABLED
970  // avoid copying icvs for proxy tasks
971  if ( flags->proxy == TASK_FULL )
972 #endif
973  copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
974 
975  taskdata->td_flags.tiedness = flags->tiedness;
976  taskdata->td_flags.final = flags->final;
977  taskdata->td_flags.merged_if0 = flags->merged_if0;
978 #if OMP_40_ENABLED
979  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
980 #endif // OMP_40_ENABLED
981 #if OMP_41_ENABLED
982  taskdata->td_flags.proxy = flags->proxy;
983 #endif
984  taskdata->td_flags.tasktype = TASK_EXPLICIT;
985 
986  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
987  taskdata->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
988 
989  // GEH - TODO: fix this to copy parent task's value of team_serial flag
990  taskdata->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
991 
992  // GEH - Note we serialize the task if the team is serialized to make sure implicit parallel region
993  // tasks are not left until program termination to execute. Also, it helps locality to execute
994  // immediately.
995  taskdata->td_flags.task_serial = ( parent_task->td_flags.final
996  || taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser );
997 
998  taskdata->td_flags.started = 0;
999  taskdata->td_flags.executing = 0;
1000  taskdata->td_flags.complete = 0;
1001  taskdata->td_flags.freed = 0;
1002 
1003  taskdata->td_flags.native = flags->native;
1004 
1005  taskdata->td_incomplete_child_tasks = 0;
1006  taskdata->td_allocated_child_tasks = 1; // start at one because counts current task and children
1007 #if OMP_40_ENABLED
1008  taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
1009  taskdata->td_dephash = NULL;
1010  taskdata->td_depnode = NULL;
1011 #endif
1012 
1013  // Only need to keep track of child task counts if team parallel and tasking not serialized or if it is a proxy task
1014 #if OMP_41_ENABLED
1015  if ( flags->proxy == TASK_PROXY || !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1016 #else
1017  if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1018 #endif
1019  {
1020  KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
1021 #if OMP_40_ENABLED
1022  if ( parent_task->td_taskgroup )
1023  KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
1024 #endif
1025  // Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
1026  if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT ) {
1027  KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
1028  }
1029  }
1030 
1031  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1032  gtid, taskdata, taskdata->td_parent) );
1033 
1034 #if OMPT_SUPPORT
1035  __kmp_task_init_ompt(taskdata, gtid, (void*) task_entry);
1036 #endif
1037 
1038  return task;
1039 }
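// The single allocation above lays the pieces out back to back:
//
//   [ kmp_taskdata_t | kmp_task_t + privates | pad to sizeof(void *) | shareds ]
//   ^ taskdata        ^ task                                          ^ task->shareds
//
// A sketch of the offset arithmetic (hypothetical helper name; this mirrors the
// shareds_offset computation at the top of __kmp_task_alloc):
#if 0
static size_t
ex_shareds_offset( size_t sizeof_kmp_task_t )
{
    // Pad so the shareds block (an array of pointers) starts pointer-aligned.
    return __kmp_round_up_to_val( sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t,
                                  sizeof( void * ) );
}
#endif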
1040 
1041 
1042 kmp_task_t *
1043 __kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
1044  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1045  kmp_routine_entry_t task_entry )
1046 {
1047  kmp_task_t *retval;
1048  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags;
1049 
1050  input_flags->native = FALSE;
1051  // __kmp_task_alloc() sets up all other runtime flags
1052 
1053 #if OMP_41_ENABLED
1054  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1055  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1056  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1057  input_flags->proxy ? "proxy" : "",
1058  sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1059 #else
1060  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1061  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1062  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1063  sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1064 #endif
1065 
1066  retval = __kmp_task_alloc( loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1067  sizeof_shareds, task_entry );
1068 
1069  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval) );
1070 
1071  return retval;
1072 }
1073 
1074 //-----------------------------------------------------------
1075 // __kmp_invoke_task: invoke the specified task
1076 //
1077 // gtid: global thread ID of caller
1078 // task: the task to invoke
1079 // current_task: the task to resume after task invocation
1080 
1081 static void
1082 __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_task )
1083 {
1084  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
1085 #if OMP_40_ENABLED
1086  int discard = 0 /* false */;
1087 #endif
1088  KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1089  gtid, taskdata, current_task) );
1090  KMP_DEBUG_ASSERT(task);
1091 #if OMP_41_ENABLED
1092  if ( taskdata->td_flags.proxy == TASK_PROXY &&
1093  taskdata->td_flags.complete == 1)
1094  {
1095  // This is a proxy task that was already completed but it needs to run
1096  // its bottom-half finish
1097  KA_TRACE(30, ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1098  gtid, taskdata) );
1099 
1100  __kmp_bottom_half_finish_proxy(gtid,task);
1101 
1102  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for proxy task %p, resuming task %p\n", gtid, taskdata, current_task) );
1103 
1104  return;
1105  }
1106 #endif
1107 
1108 #if OMP_41_ENABLED
1109  // Proxy tasks are not handled by the runtime
1110  if ( taskdata->td_flags.proxy != TASK_PROXY )
1111 #endif
1112  __kmp_task_start( gtid, task, current_task );
1113 
1114 #if OMPT_SUPPORT
1115  ompt_thread_info_t oldInfo;
1116  kmp_info_t * thread;
1117  if (ompt_enabled) {
1118  // Store the threads states and restore them after the task
1119  thread = __kmp_threads[ gtid ];
1120  oldInfo = thread->th.ompt_thread_info;
1121  thread->th.ompt_thread_info.wait_id = 0;
1122  thread->th.ompt_thread_info.state = ompt_state_work_parallel;
1123  taskdata->ompt_task_info.frame.exit_runtime_frame = __builtin_frame_address(0);
1124  }
1125 #endif
1126 
1127 #if OMP_40_ENABLED
1128  // TODO: cancel tasks if the parallel region has also been cancelled
1129  // TODO: check if this sequence can be hoisted above __kmp_task_start
1130  // if cancellation has been enabled for this run ...
1131  if (__kmp_omp_cancellation) {
1132  kmp_info_t *this_thr = __kmp_threads [ gtid ];
1133  kmp_team_t * this_team = this_thr->th.th_team;
1134  kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1135  if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
1136  KMP_COUNT_BLOCK(TASK_cancelled);
1137  // this task belongs to a task group and we need to cancel it
1138  discard = 1 /* true */;
1139  }
1140  }
1141 
1142  //
1143  // Invoke the task routine and pass in relevant data.
1144  // Thunks generated by gcc take a different argument list.
1145  //
1146  if (!discard) {
1147  KMP_COUNT_BLOCK(TASK_executed);
1148  KMP_TIME_BLOCK (TASK_execution);
1149 #endif // OMP_40_ENABLED
1150 
1151 #if OMPT_SUPPORT && OMPT_TRACE
1152  /* let OMPT know that we're about to run this task */
1153  if (ompt_enabled &&
1154  ompt_callbacks.ompt_callback(ompt_event_task_switch))
1155  {
1156  ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1157  current_task->ompt_task_info.task_id,
1158  taskdata->ompt_task_info.task_id);
1159  }
1160 #endif
1161 
1162 #ifdef KMP_GOMP_COMPAT
1163  if (taskdata->td_flags.native) {
1164  ((void (*)(void *))(*(task->routine)))(task->shareds);
1165  }
1166  else
1167 #endif /* KMP_GOMP_COMPAT */
1168  {
1169  (*(task->routine))(gtid, task);
1170  }
1171 
1172 #if OMPT_SUPPORT && OMPT_TRACE
1173  /* let OMPT know that we're returning to the callee task */
1174  if (ompt_enabled &&
1175  ompt_callbacks.ompt_callback(ompt_event_task_switch))
1176  {
1177  ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1178  taskdata->ompt_task_info.task_id,
1179  current_task->ompt_task_info.task_id);
1180  }
1181 #endif
1182 
1183 #if OMP_40_ENABLED
1184  }
1185 #endif // OMP_40_ENABLED
1186 
1187 
1188 #if OMPT_SUPPORT
1189  if (ompt_enabled) {
1190  thread->th.ompt_thread_info = oldInfo;
1191  taskdata->ompt_task_info.frame.exit_runtime_frame = 0;
1192  }
1193 #endif
1194 
1195 #if OMP_41_ENABLED
1196  // Proxy tasks are not handled by the runtime
1197  if ( taskdata->td_flags.proxy != TASK_PROXY )
1198 #endif
1199  __kmp_task_finish( gtid, task, current_task );
1200 
1201  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1202  gtid, taskdata, current_task) );
1203  return;
1204 }
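// task->routine above is the compiler-outlined task body; on the native (non-GOMP)
// path it is invoked as routine(gtid, task). A sketch of what such an outlined
// routine might look like; the privates/shareds layout and all ex_* names are
// hypothetical, not what any particular compiler emits.
#if 0
struct ex_task_shareds { int *result; };            // pointers to shared variables
struct ex_task_with_privates {
    kmp_task_t part;                                // runtime header must come first
    int        firstprivate_x;                      // captured firstprivate value
};

static kmp_int32
ex_task_entry( kmp_int32 gtid, void *task )
{
    struct ex_task_with_privates *t = (struct ex_task_with_privates *) task;
    struct ex_task_shareds *shareds = (struct ex_task_shareds *) t->part.shareds;
    (void) gtid;                                    // unused in this trivial body
    *shareds->result = t->firstprivate_x * 2;       // the actual task body
    return 0;
}
#endif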
1205 
1206 //-----------------------------------------------------------------------
1207 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1208 //
1209 // loc_ref: location of original task pragma (ignored)
1210 // gtid: Global Thread ID of encountering thread
1211 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1212 // Returns:
1213 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1214 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1215 
1216 kmp_int32
1217 __kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1218 {
1219  kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1220 
1221  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n",
1222  gtid, loc_ref, new_taskdata ) );
1223 
1224  /* Should we execute the new task or queue it? For now, let's just always try to
1225  queue it. If the queue fills up, then we'll execute it. */
1226 
1227  if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1228  { // Execute this task immediately
1229  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1230  new_taskdata->td_flags.task_serial = 1;
1231  __kmp_invoke_task( gtid, new_task, current_task );
1232  }
1233 
1234  KA_TRACE(10, ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1235  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref,
1236  new_taskdata ) );
1237 
1238  return TASK_CURRENT_NOT_QUEUED;
1239 }
1240 
1241 //---------------------------------------------------------------------
1242 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1243 // gtid: Global Thread ID of encountering thread
1244 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1245 // serialize_immediate: if TRUE then if the task is executed immediately its execution will be serialized
1246 // returns:
1247 //
1248 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1249 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1250 kmp_int32
1251 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate )
1252 {
1253  kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1254 
1255 #if OMPT_SUPPORT
1256  if (ompt_enabled) {
1257  new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1258  __builtin_frame_address(0);
1259  }
1260 #endif
1261 
1262  /* Should we execute the new task or queue it? For now, let's just always try to
1263  queue it. If the queue fills up, then we'll execute it. */
1264 #if OMP_41_ENABLED
1265  if ( new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1266 #else
1267  if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1268 #endif
1269  { // Execute this task immediately
1270  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1271  if ( serialize_immediate )
1272  new_taskdata -> td_flags.task_serial = 1;
1273  __kmp_invoke_task( gtid, new_task, current_task );
1274  }
1275 
1276 #if OMPT_SUPPORT
1277  if (ompt_enabled) {
1278  new_taskdata->ompt_task_info.frame.reenter_runtime_frame = 0;
1279  }
1280 #endif
1281 
1282  return TASK_CURRENT_NOT_QUEUED;
1283 }
1284 
1285 //---------------------------------------------------------------------
1286 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a non-thread-switchable task from
1287 // the parent thread only!
1288 // loc_ref: location of original task pragma (ignored)
1289 // gtid: Global Thread ID of encountering thread
1290 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1291 // returns:
1292 //
1293 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1294 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1295 
1296 kmp_int32
1297 __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1298 {
1299  kmp_int32 res;
1300 
1301 #if KMP_DEBUG
1302  kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1303 #endif
1304  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n",
1305  gtid, loc_ref, new_taskdata ) );
1306 
1307  res = __kmp_omp_task(gtid,new_task,true);
1308 
1309  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1310  gtid, loc_ref, new_taskdata ) );
1311  return res;
1312 }
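// A sketch of how a front end might lower
//   #pragma omp task firstprivate(x) shared(r)
// onto the entry points above, reusing the hypothetical ex_task_entry and struct
// layouts sketched after __kmp_invoke_task (flags value 1 sets the tied bit):
#if 0
static void
ex_emit_task( ident_t *loc, kmp_int32 gtid, int x, int *r )
{
    kmp_task_t *t = __kmpc_omp_task_alloc( loc, gtid, /* flags: tied */ 1,
                                           sizeof( struct ex_task_with_privates ),
                                           sizeof( struct ex_task_shareds ),
                                           &ex_task_entry );
    ((struct ex_task_with_privates *) t)->firstprivate_x = x;   // copy firstprivates
    ((struct ex_task_shareds *) t->shareds)->result = r;        // fill shareds pointers
    __kmpc_omp_task( loc, gtid, t );                            // queue or run the task
}
#endif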
1313 
1314 //-------------------------------------------------------------------------------------
1315 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are complete
1316 
1317 kmp_int32
1318 __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid )
1319 {
1320  kmp_taskdata_t * taskdata;
1321  kmp_info_t * thread;
1322  int thread_finished = FALSE;
1323 
1324  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref) );
1325 
1326  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1327  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1328 
1329  thread = __kmp_threads[ gtid ];
1330  taskdata = thread -> th.th_current_task;
1331 
1332 #if OMPT_SUPPORT && OMPT_TRACE
1333  ompt_task_id_t my_task_id;
1334  ompt_parallel_id_t my_parallel_id;
1335 
1336  if (ompt_enabled) {
1337  kmp_team_t *team = thread->th.th_team;
1338  my_task_id = taskdata->ompt_task_info.task_id;
1339  my_parallel_id = team->t.ompt_team_info.parallel_id;
1340 
1341  if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1342  ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(
1343  my_parallel_id, my_task_id);
1344  }
1345  }
1346 #endif
1347 
1348 #if USE_ITT_BUILD
1349  // Note: These values are used by ITT events as well.
1350 #endif /* USE_ITT_BUILD */
1351  taskdata->td_taskwait_counter += 1;
1352  taskdata->td_taskwait_ident = loc_ref;
1353  taskdata->td_taskwait_thread = gtid + 1;
1354 
1355 #if USE_ITT_BUILD
1356  void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1357  if ( itt_sync_obj != NULL )
1358  __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1359 #endif /* USE_ITT_BUILD */
1360 
1361 #if OMP_41_ENABLED
1362  if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
1363 #else
1364  if ( ! taskdata->td_flags.team_serial )
1365 #endif
1366  {
1367  // GEH: if team serialized, avoid reading the volatile variable below.
1368  kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
1369  while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
1370  flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1371  USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1372  }
1373  }
1374 #if USE_ITT_BUILD
1375  if ( itt_sync_obj != NULL )
1376  __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1377 #endif /* USE_ITT_BUILD */
1378 
1379  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1380  taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1381 
1382 #if OMPT_SUPPORT && OMPT_TRACE
1383  if (ompt_enabled &&
1384  ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1385  ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(
1386  my_parallel_id, my_task_id);
1387  }
1388 #endif
1389  }
1390 
1391  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1392  "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1393 
1394  return TASK_CURRENT_NOT_QUEUED;
1395 }
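// The wait above is a "help while waiting" loop: instead of blocking, the thread
// keeps executing queued tasks until the current task's child counter drains to
// zero. A stripped-down model of the pattern; the helper names are hypothetical
// and the real code additionally handles stealing constraints and ITT objects.
#if 0
static void
ex_wait_for_children( volatile kmp_uint32 *incomplete_children,
                      int (*try_execute_one_task)( void ) )
{
    while ( TCR_4( *incomplete_children ) != 0 ) {
        if ( ! try_execute_one_task() )   // nothing available to execute
            KMP_YIELD( TRUE );            // back off instead of spinning hot
    }
}
#endif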
1396 
1397 
1398 //-------------------------------------------------
1399 // __kmpc_omp_taskyield: switch to a different task
1400 
1401 kmp_int32
1402 __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
1403 {
1404  kmp_taskdata_t * taskdata;
1405  kmp_info_t * thread;
1406  int thread_finished = FALSE;
1407 
1408  KMP_COUNT_BLOCK(OMP_TASKYIELD);
1409 
1410  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1411  gtid, loc_ref, end_part) );
1412 
1413  if ( __kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel ) {
1414  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1415 
1416  thread = __kmp_threads[ gtid ];
1417  taskdata = thread -> th.th_current_task;
1418  // Should we model this as a task wait or not?
1419 #if USE_ITT_BUILD
1420  // Note: These values are used by ITT events as well.
1421 #endif /* USE_ITT_BUILD */
1422  taskdata->td_taskwait_counter += 1;
1423  taskdata->td_taskwait_ident = loc_ref;
1424  taskdata->td_taskwait_thread = gtid + 1;
1425 
1426 #if USE_ITT_BUILD
1427  void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1428  if ( itt_sync_obj != NULL )
1429  __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1430 #endif /* USE_ITT_BUILD */
1431  if ( ! taskdata->td_flags.team_serial ) {
1432  kmp_task_team_t * task_team = thread->th.th_task_team;
1433  if (task_team != NULL) {
1434  if (KMP_TASKING_ENABLED(task_team)) {
1435  __kmp_execute_tasks_32( thread, gtid, NULL, FALSE, &thread_finished
1436  USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1437  }
1438  }
1439  }
1440 #if USE_ITT_BUILD
1441  if ( itt_sync_obj != NULL )
1442  __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1443 #endif /* USE_ITT_BUILD */
1444 
1445  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1446  taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1447  }
1448 
1449  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1450  "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1451 
1452  return TASK_CURRENT_NOT_QUEUED;
1453 }
1454 
1455 
1456 #if OMP_40_ENABLED
1457 //-------------------------------------------------------------------------------------
1458 // __kmpc_taskgroup: Start a new taskgroup
1459 
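// Illustrative sketch (not part of the runtime): "#pragma omp taskgroup" is
// expected to be lowered into a matched __kmpc_taskgroup / __kmpc_end_taskgroup
// pair bracketing the structured block. The wrapper and "loc" are hypothetical.
//
//     static void lowered_taskgroup( ident_t *loc )
//     {
//         kmp_int32 gtid = __kmpc_global_thread_num( loc );
//         __kmpc_taskgroup( loc, gtid );      // push a new kmp_taskgroup_t
//         // ... tasks created here are counted in the new group's "count" ...
//         __kmpc_end_taskgroup( loc, gtid );  // wait until the group's count is 0
//     }
//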
1460 void
1461 __kmpc_taskgroup( ident_t* loc, int gtid )
1462 {
1463  kmp_info_t * thread = __kmp_threads[ gtid ];
1464  kmp_taskdata_t * taskdata = thread->th.th_current_task;
1465  kmp_taskgroup_t * tg_new =
1466  (kmp_taskgroup_t *)__kmp_thread_malloc( thread, sizeof( kmp_taskgroup_t ) );
1467  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new) );
1468  tg_new->count = 0;
1469  tg_new->cancel_request = cancel_noreq;
1470  tg_new->parent = taskdata->td_taskgroup;
1471  taskdata->td_taskgroup = tg_new;
1472 }
1473 
1474 
1475 //-------------------------------------------------------------------------------------
1476 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
1477 // and its descendants are complete
1478 
1479 void
1480 __kmpc_end_taskgroup( ident_t* loc, int gtid )
1481 {
1482  kmp_info_t * thread = __kmp_threads[ gtid ];
1483  kmp_taskdata_t * taskdata = thread->th.th_current_task;
1484  kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1485  int thread_finished = FALSE;
1486 
1487  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) );
1488  KMP_DEBUG_ASSERT( taskgroup != NULL );
1489 
1490  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1491 #if USE_ITT_BUILD
1492  // For ITT the taskgroup wait is similar to taskwait until we need to distinguish them
1493  void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1494  if ( itt_sync_obj != NULL )
1495  __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1496 #endif /* USE_ITT_BUILD */
1497 
1498 #if OMP_41_ENABLED
1499  if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
1500 #else
1501  if ( ! taskdata->td_flags.team_serial )
1502 #endif
1503  {
1504  kmp_flag_32 flag(&(taskgroup->count), 0U);
1505  while ( TCR_4(taskgroup->count) != 0 ) {
1506  flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1507  USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1508  }
1509  }
1510 
1511 #if USE_ITT_BUILD
1512  if ( itt_sync_obj != NULL )
1513  __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1514 #endif /* USE_ITT_BUILD */
1515  }
1516  KMP_DEBUG_ASSERT( taskgroup->count == 0 );
1517 
1518  // Restore parent taskgroup for the current task
1519  taskdata->td_taskgroup = taskgroup->parent;
1520  __kmp_thread_free( thread, taskgroup );
1521 
1522  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", gtid, taskdata) );
1523 }
1524 #endif
1525 
1526 
1527 //------------------------------------------------------
1528 // __kmp_remove_my_task: remove a task from my own deque
1529 
1530 static kmp_task_t *
1531 __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task_team,
1532  kmp_int32 is_constrained )
1533 {
1534  kmp_task_t * task;
1535  kmp_taskdata_t * taskdata;
1536  kmp_thread_data_t *thread_data;
1537  kmp_uint32 tail;
1538 
1539  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1540  KMP_DEBUG_ASSERT( task_team -> tt.tt_threads_data != NULL ); // Caller should check this condition
1541 
1542  thread_data = & task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
1543 
1544  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1545  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1546  thread_data->td.td_deque_tail) );
1547 
1548  if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1549  KA_TRACE(10, ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1550  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1551  thread_data->td.td_deque_tail) );
1552  return NULL;
1553  }
1554 
1555  __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
1556 
1557  if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1558  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1559  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1560  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1561  thread_data->td.td_deque_tail) );
1562  return NULL;
1563  }
1564 
1565  tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK; // Wrap index.
1566  taskdata = thread_data -> td.td_deque[ tail ];
1567 
1568  if (is_constrained) {
1569  // we need to check if the candidate obeys task scheduling constraint:
1570  // only child of current task can be scheduled
1571  kmp_taskdata_t * current = thread->th.th_current_task;
1572  kmp_int32 level = current->td_level;
1573  kmp_taskdata_t * parent = taskdata->td_parent;
1574  while ( parent != current && parent->td_level > level ) {
1575  parent = parent->td_parent; // check generation up to the level of the current task
1576  KMP_DEBUG_ASSERT(parent != NULL);
1577  }
1578  if ( parent != current ) {
1579  // If the tail task is not a child, then no other children can appear in the deque.
1580  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1581  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1582  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1583  thread_data->td.td_deque_tail) );
1584  return NULL;
1585  }
1586  }
1587 
1588  thread_data -> td.td_deque_tail = tail;
1589  TCW_4(thread_data -> td.td_deque_ntasks, thread_data -> td.td_deque_ntasks - 1);
1590 
1591  __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
1592 
1593  KA_TRACE(10, ("__kmp_remove_my_task(exit #3): T#%d task %p removed: ntasks=%d head=%u tail=%u\n",
1594  gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1595  thread_data->td.td_deque_tail) );
1596 
1597  task = KMP_TASKDATA_TO_TASK( taskdata );
1598  return task;
1599 }
1600 
1601 
1602 //-----------------------------------------------------------
1603 // __kmp_steal_task: remove a task from another thread's deque
1604 // Assume that calling thread has already checked existence of
1605 // task_team thread_data before calling this routine.
1606 
1607 static kmp_task_t *
1608 __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team,
1609  volatile kmp_uint32 *unfinished_threads, int *thread_finished,
1610  kmp_int32 is_constrained )
1611 {
1612  kmp_task_t * task;
1613  kmp_taskdata_t * taskdata;
1614  kmp_thread_data_t *victim_td, *threads_data;
1615  kmp_int32 victim_tid;
1616 
1617  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1618 
1619  threads_data = task_team -> tt.tt_threads_data;
1620  KMP_DEBUG_ASSERT( threads_data != NULL ); // Caller should check this condition
1621 
1622  victim_tid = victim->th.th_info.ds.ds_tid;
1623  victim_td = & threads_data[ victim_tid ];
1624 
1625  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: task_team=%p ntasks=%d "
1626  "head=%u tail=%u\n",
1627  gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1628  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1629 
1630  if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) || // Caller should not check this condition
1631  (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1632  {
1633  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: task_team=%p "
1634  "ntasks=%d head=%u tail=%u\n",
1635  gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1636  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1637  return NULL;
1638  }
1639 
1640  __kmp_acquire_bootstrap_lock( & victim_td -> td.td_deque_lock );
1641 
1642  // Check again after we acquire the lock
1643  if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) ||
1644  (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1645  {
1646  __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1647  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
1648  "ntasks=%d head=%u tail=%u\n",
1649  gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1650  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1651  return NULL;
1652  }
1653 
1654  KMP_DEBUG_ASSERT( victim_td -> td.td_deque != NULL );
1655 
1656  if ( !is_constrained ) {
1657  taskdata = victim_td -> td.td_deque[ victim_td -> td.td_deque_head ];
1658  // Bump head pointer and Wrap.
1659  victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK;
1660  } else {
1661  // While we have postponed tasks let's steal from tail of the deque (smaller tasks)
1662  kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK; // Wrap index.
1663  taskdata = victim_td -> td.td_deque[ tail ];
1664  // we need to check if the candidate obeys task scheduling constraint:
1665  // only child of current task can be scheduled
1666  kmp_taskdata_t * current = __kmp_threads[ gtid ]->th.th_current_task;
1667  kmp_int32 level = current->td_level;
1668  kmp_taskdata_t * parent = taskdata->td_parent;
1669  while ( parent != current && parent->td_level > level ) {
1670  parent = parent->td_parent; // check generation up to the level of the current task
1671  KMP_DEBUG_ASSERT(parent != NULL);
1672  }
1673  if ( parent != current ) {
1674  // If the tail task is not a child, then no other children can appear in the deque (?).
1675  __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1676  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
1677  "ntasks=%d head=%u tail=%u\n",
1678  gtid, __kmp_gtid_from_thread( threads_data[victim_tid].td.td_thr ),
1679  task_team, victim_td->td.td_deque_ntasks,
1680  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1681  return NULL;
1682  }
1683  victim_td -> td.td_deque_tail = tail;
1684  }
1685  if (*thread_finished) {
1686  // We need to un-mark this thread as a finished thread, since it just stole a task. This must
1687  // be done before releasing the victim's deque lock, or else other threads (starting with the
1688  // master thread) might be prematurely released from the barrier!!!
1689  kmp_uint32 count;
1690 
1691  count = KMP_TEST_THEN_INC32( (kmp_int32 *)unfinished_threads );
1692 
1693  KA_TRACE(20, ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
1694  gtid, count + 1, task_team) );
1695 
1696  *thread_finished = FALSE;
1697  }
1698  TCW_4(victim_td -> td.td_deque_ntasks, TCR_4(victim_td -> td.td_deque_ntasks) - 1);
1699 
1700  __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1701 
1702  KMP_COUNT_BLOCK(TASK_stolen);
1703  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
1704  "ntasks=%d head=%u tail=%u\n",
1705  gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,
1706  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1707  victim_td->td.td_deque_tail) );
1708 
1709  task = KMP_TASKDATA_TO_TASK( taskdata );
1710  return task;
1711 }
1712 
1713 
1714 //-----------------------------------------------------------------------------
1715 // __kmp_execute_tasks_template: Choose and execute tasks until either the condition
1716  // is satisfied (return true) or there are none left (return false).
1717 // final_spin is TRUE if this is the spin at the release barrier.
1718 // thread_finished indicates whether the thread is finished executing all
1719 // the tasks it has on its deque, and is at the release barrier.
1720  // flag encapsulates the spin location (spinner) and the value that terminates
1721  // the spin (checker); see the kmp_flag_* classes in kmp_wait_release.h.
1722  // flag == NULL means only execute a single task and return.
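// Overall structure of the routine below (summary):
//   1) Drain the calling thread's own deque via __kmp_remove_my_task().
//   2) If it is empty, retry the victim this thread last stole from successfully
//      (td_deque_last_stolen), restarting at step 1 whenever a stolen task
//      spawns new work onto our own deque.
//   3) Otherwise pick a random victim (excluding ourselves); if the victim is
//      sleeping, wake it and pick another victim, otherwise steal from it via
//      __kmp_steal_task().
//   Each time a queue is found empty, if this is the final spin at the barrier,
//   the thread decrements tt_unfinished_threads once and re-checks the flag.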
1723 template <class C>
1724 static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
1725  int *thread_finished
1726  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
1727 {
1728  kmp_task_team_t * task_team;
1729  kmp_thread_data_t * threads_data;
1730  kmp_task_t * task;
1731  kmp_taskdata_t * current_task = thread -> th.th_current_task;
1732  volatile kmp_uint32 * unfinished_threads;
1733  kmp_int32 nthreads, last_stolen, k, tid;
1734 
1735  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1736  KMP_DEBUG_ASSERT( thread == __kmp_threads[ gtid ] );
1737 
1738  task_team = thread -> th.th_task_team;
1739  if (task_team == NULL) return FALSE;
1740 
1741  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n",
1742  gtid, final_spin, *thread_finished) );
1743 
1744  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
1745  KMP_DEBUG_ASSERT( threads_data != NULL );
1746 
1747  nthreads = task_team -> tt.tt_nproc;
1748  unfinished_threads = &(task_team -> tt.tt_unfinished_threads);
1749 #if OMP_41_ENABLED
1750  KMP_DEBUG_ASSERT( nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
1751 #else
1752  KMP_DEBUG_ASSERT( nthreads > 1 );
1753 #endif
1754  KMP_DEBUG_ASSERT( TCR_4((int)*unfinished_threads) >= 0 );
1755 
1756  // Choose tasks from our own work queue.
1757  start:
1758  while (( task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained )) != NULL ) {
1759 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1760  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
1761  if ( itt_sync_obj == NULL ) {
1762  // we are at fork barrier where we could not get the object reliably
1763  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1764  }
1765  __kmp_itt_task_starting( itt_sync_obj );
1766  }
1767 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1768  __kmp_invoke_task( gtid, task, current_task );
1769 #if USE_ITT_BUILD
1770  if ( itt_sync_obj != NULL )
1771  __kmp_itt_task_finished( itt_sync_obj );
1772 #endif /* USE_ITT_BUILD */
1773 
1774  // If this thread is only partway through the barrier and the condition
1775  // is met, then return now, so that the barrier gather/release pattern can proceed.
1776  // If this thread is in the last spin loop in the barrier, waiting to be
1777  // released, we know that the termination condition will not be satisfied,
1778  // so don't waste any cycles checking it.
1779  if (flag == NULL || (!final_spin && flag->done_check())) {
1780  KA_TRACE(15, ("__kmp_execute_tasks_template(exit #1): T#%d spin condition satisfied\n", gtid) );
1781  return TRUE;
1782  }
1783  if (thread->th.th_task_team == NULL) break;
1784  KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
1785  }
1786 
1787  // This thread's work queue is empty. If we are in the final spin loop
1788  // of the barrier, check and see if the termination condition is satisfied.
1789 #if OMP_41_ENABLED
1790  // The work queue may be empty but there might be proxy tasks still executing
1791  if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
1792 #else
1793  if (final_spin)
1794 #endif
1795  {
1796  // First, decrement the #unfinished threads, if that has not already
1797  // been done. This decrement might be to the spin location, and
1798  // result in the termination condition being satisfied.
1799  if (! *thread_finished) {
1800  kmp_uint32 count;
1801 
1802  count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
1803  KA_TRACE(20, ("__kmp_execute_tasks_template(dec #1): T#%d dec unfinished_threads to %d task_team=%p\n",
1804  gtid, count, task_team) );
1805  *thread_finished = TRUE;
1806  }
1807 
1808  // It is now unsafe to reference thread->th.th_team !!!
1809  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
1810  // thread to pass through the barrier, where it might reset each thread's
1811  // th.th_team field for the next parallel region.
1812  // If we can steal more work, we know that this has not happened yet.
1813  if (flag != NULL && flag->done_check()) {
1814  KA_TRACE(15, ("__kmp_execute_tasks_template(exit #2): T#%d spin condition satisfied\n", gtid) );
1815  return TRUE;
1816  }
1817  }
1818 
1819  if (thread->th.th_task_team == NULL) return FALSE;
1820 #if OMP_41_ENABLED
1821  // check if there are other threads to steal from, otherwise go back
1822  if ( nthreads == 1 )
1823  goto start;
1824 #endif
1825 
1826  // Try to steal from the last place I stole from successfully.
1827  tid = thread -> th.th_info.ds.ds_tid;//__kmp_tid_from_gtid( gtid );
1828  last_stolen = threads_data[ tid ].td.td_deque_last_stolen;
1829 
1830  if (last_stolen != -1) {
1831  kmp_info_t *other_thread = threads_data[last_stolen].td.td_thr;
1832 
1833  while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
1834  thread_finished, is_constrained )) != NULL)
1835  {
1836 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1837  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
1838  if ( itt_sync_obj == NULL ) {
1839  // we are at fork barrier where we could not get the object reliably
1840  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1841  }
1842  __kmp_itt_task_starting( itt_sync_obj );
1843  }
1844 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1845  __kmp_invoke_task( gtid, task, current_task );
1846 #if USE_ITT_BUILD
1847  if ( itt_sync_obj != NULL )
1848  __kmp_itt_task_finished( itt_sync_obj );
1849 #endif /* USE_ITT_BUILD */
1850 
1851  // Check to see if this thread can proceed.
1852  if (flag == NULL || (!final_spin && flag->done_check())) {
1853  KA_TRACE(15, ("__kmp_execute_tasks_template(exit #3): T#%d spin condition satisfied\n",
1854  gtid) );
1855  return TRUE;
1856  }
1857 
1858  if (thread->th.th_task_team == NULL) break;
1859  KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
1860  // If the execution of the stolen task resulted in more tasks being
1861  // placed on our run queue, then restart the whole process.
1862  if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
1863  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
1864  gtid) );
1865  goto start;
1866  }
1867  }
1868 
1869  // Don't give priority to stealing from this thread anymore.
1870  threads_data[ tid ].td.td_deque_last_stolen = -1;
1871 
1872  // The victim's work queue is empty. If we are in the final spin loop
1873  // of the barrier, check and see if the termination condition is satisfied.
1874 #if OMP_41_ENABLED
1875  // The work queue may be empty but there might be proxy tasks still executing
1876  if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
1877 #else
1878  if (final_spin)
1879 #endif
1880  {
1881  // First, decrement the #unfinished threads, if that has not already
1882  // been done. This decrement might be to the spin location, and
1883  // result in the termination condition being satisfied.
1884  if (! *thread_finished) {
1885  kmp_uint32 count;
1886 
1887  count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
1888  KA_TRACE(20, ("__kmp_execute_tasks_template(dec #2): T#%d dec unfinished_threads to %d "
1889  "task_team=%p\n", gtid, count, task_team) );
1890  *thread_finished = TRUE;
1891  }
1892 
1893  // If __kmp_tasking_mode != tskm_immediate_exec
1894  // then it is now unsafe to reference thread->th.th_team !!!
1895  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
1896  // thread to pass through the barrier, where it might reset each thread's
1897  // th.th_team field for the next parallel region.
1898  // If we can steal more work, we know that this has not happened yet.
1899  if (flag != NULL && flag->done_check()) {
1900  KA_TRACE(15, ("__kmp_execute_tasks_template(exit #4): T#%d spin condition satisfied\n",
1901  gtid) );
1902  return TRUE;
1903  }
1904  }
1905  if (thread->th.th_task_team == NULL) return FALSE;
1906  }
1907 
1908  // Find a different thread to steal work from. Pick a random thread.
1909  // My initial plan was to cycle through all the threads, and only return
1910  // if we tried to steal from every thread, and failed. Arch says that's
1911  // not such a great idea.
1912  // GEH - need yield code in this loop for throughput library mode?
1913  new_victim:
1914  k = __kmp_get_random( thread ) % (nthreads - 1);
1915  if ( k >= thread -> th.th_info.ds.ds_tid ) {
1916  ++k; // Adjusts random distribution to exclude self
1917  }
1918  {
1919  kmp_info_t *other_thread = threads_data[k].td.td_thr;
1920  int first;
1921 
1922  // There is a slight chance that __kmp_enable_tasking() did not wake up
1923  // all threads waiting at the barrier. If this thread is sleeping, then
1924  // wake it up. Since we were going to pay the cache miss penalty
1925  // for referencing another thread's kmp_info_t struct anyway, the check
1926  // shouldn't cost too much performance at this point.
1927  // In extra barrier mode, tasks do not sleep at the separate tasking
1928  // barrier, so this isn't a problem.
1929  if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
1930  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
1931  (TCR_PTR(other_thread->th.th_sleep_loc) != NULL))
1932  {
1933  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
1934  // A sleeping thread should not have any tasks on its queue.
1935  // There is a slight possibility that it resumes, steals a task from
1936  // another thread, which spawns more tasks, all in the time that it takes
1937  // this thread to check => don't write an assertion that the victim's
1938  // queue is empty. Try stealing from a different thread.
1939  goto new_victim;
1940  }
1941 
1942  // Now try to steal work from the selected thread
1943  first = TRUE;
1944  while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
1945  thread_finished, is_constrained )) != NULL)
1946  {
1947 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1948  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
1949  if ( itt_sync_obj == NULL ) {
1950  // we are at fork barrier where we could not get the object reliably
1951  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1952  }
1953  __kmp_itt_task_starting( itt_sync_obj );
1954  }
1955 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1956  __kmp_invoke_task( gtid, task, current_task );
1957 #if USE_ITT_BUILD
1958  if ( itt_sync_obj != NULL )
1959  __kmp_itt_task_finished( itt_sync_obj );
1960 #endif /* USE_ITT_BUILD */
1961 
1962  // Try stealing from this victim again, in the future.
1963  if (first) {
1964  threads_data[ tid ].td.td_deque_last_stolen = k;
1965  first = FALSE;
1966  }
1967 
1968  // Check to see if this thread can proceed.
1969  if (flag == NULL || (!final_spin && flag->done_check())) {
1970  KA_TRACE(15, ("__kmp_execute_tasks_template(exit #5): T#%d spin condition satisfied\n",
1971  gtid) );
1972  return TRUE;
1973  }
1974  if (thread->th.th_task_team == NULL) break;
1975  KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
1976 
1977  // If the execution of the stolen task resulted in more tasks being
1978  // placed on our run queue, then restart the whole process.
1979  if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
1980  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
1981  gtid) );
1982  goto start;
1983  }
1984  }
1985 
1986  // The victim's work queue is empty. If we are in the final spin loop
1987  // of the barrier, check and see if the termination condition is satisfied.
1988  // Going on and finding a new victim to steal from is expensive, as it
1989  // involves a lot of cache misses, so we definitely want to re-check the
1990  // termination condition before doing that.
1991 #if OMP_41_ENABLED
1992  // The work queue may be empty but there might be proxy tasks still executing
1993  if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
1994 #else
1995  if (final_spin)
1996 #endif
1997  {
1998  // First, decrement the #unfinished threads, if that has not already
1999  // been done. This decrement might be to the spin location, and
2000  // result in the termination condition being satisfied.
2001  if (! *thread_finished) {
2002  kmp_uint32 count;
2003 
2004  count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
2005  KA_TRACE(20, ("__kmp_execute_tasks_template(dec #3): T#%d dec unfinished_threads to %d; "
2006  "task_team=%p\n",
2007  gtid, count, task_team) );
2008  *thread_finished = TRUE;
2009  }
2010 
2011  // If __kmp_tasking_mode != tskm_immediate_exec,
2012  // then it is now unsafe to reference thread->th.th_team !!!
2013  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2014  // thread to pass through the barrier, where it might reset each thread's
2015  // th.th_team field for the next parallel region.
2016  // If we can steal more work, we know that this has not happened yet.
2017  if (flag != NULL && flag->done_check()) {
2018  KA_TRACE(15, ("__kmp_execute_tasks_template(exit #6): T#%d spin condition satisfied\n", gtid) );
2019  return TRUE;
2020  }
2021  }
2022  if (thread->th.th_task_team == NULL) return FALSE;
2023  }
2024 
2025  KA_TRACE(15, ("__kmp_execute_tasks_template(exit #7): T#%d can't find work\n", gtid) );
2026  return FALSE;
2027 }
2028 
2029 int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2030  int *thread_finished
2031  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2032 {
2033  return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2034  USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2035 }
2036 
2037 int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2038  int *thread_finished
2039  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2040 {
2041  return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2042  USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2043 }
2044 
2045 int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2046  int *thread_finished
2047  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2048 {
2049  return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2050  USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2051 }
2052 
2053 
2054 
2055 //-----------------------------------------------------------------------------
2056 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2057 // next barrier so they can assist in executing enqueued tasks.
2058 // First thread in allocates the task team atomically.
2059 
2060 static void
2061 __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr )
2062 {
2063  kmp_thread_data_t *threads_data;
2064  int nthreads, i, is_init_thread;
2065 
2066  KA_TRACE( 10, ( "__kmp_enable_tasking(enter): T#%d\n",
2067  __kmp_gtid_from_thread( this_thr ) ) );
2068 
2069  KMP_DEBUG_ASSERT(task_team != NULL);
2070  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2071 
2072  nthreads = task_team->tt.tt_nproc;
2073  KMP_DEBUG_ASSERT(nthreads > 0);
2074  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2075 
2076  // Allocate or increase the size of threads_data if necessary
2077  is_init_thread = __kmp_realloc_task_threads_data( this_thr, task_team );
2078 
2079  if (!is_init_thread) {
2080  // Some other thread already set up the array.
2081  KA_TRACE( 20, ( "__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2082  __kmp_gtid_from_thread( this_thr ) ) );
2083  return;
2084  }
2085  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
2086  KMP_DEBUG_ASSERT( threads_data != NULL );
2087 
2088  if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
2089  ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) )
2090  {
2091  // Release any threads sleeping at the barrier, so that they can steal
2092  // tasks and execute them. In extra barrier mode, tasks do not sleep
2093  // at the separate tasking barrier, so this isn't a problem.
2094  for (i = 0; i < nthreads; i++) {
2095  volatile void *sleep_loc;
2096  kmp_info_t *thread = threads_data[i].td.td_thr;
2097 
2098  if (i == this_thr->th.th_info.ds.ds_tid) {
2099  continue;
2100  }
2101  // Since we haven't locked the thread's suspend mutex lock at this
2102  // point, there is a small window where a thread might be putting
2103  // itself to sleep, but hasn't set the th_sleep_loc field yet.
2104  // To work around this, __kmp_execute_tasks_template() periodically checks to
2105  // see if other threads are sleeping (using the same random
2106  // mechanism that is used for task stealing) and awakens them if
2107  // they are.
2108  if ( ( sleep_loc = TCR_PTR( thread -> th.th_sleep_loc) ) != NULL )
2109  {
2110  KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2111  __kmp_gtid_from_thread( this_thr ),
2112  __kmp_gtid_from_thread( thread ) ) );
2113  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2114  }
2115  else {
2116  KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2117  __kmp_gtid_from_thread( this_thr ),
2118  __kmp_gtid_from_thread( thread ) ) );
2119  }
2120  }
2121  }
2122 
2123  KA_TRACE( 10, ( "__kmp_enable_tasking(exit): T#%d\n",
2124  __kmp_gtid_from_thread( this_thr ) ) );
2125 }
2126 
2127 
2128 /* ------------------------------------------------------------------------ */
2129 /* // TODO: Check the comment consistency
2130  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
2131  * like a shadow of the kmp_team_t data struct, with a different lifetime.
2132  * After a child thread checks into a barrier and calls __kmp_release() from
2133  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2134  * longer assume that the kmp_team_t structure is intact (at any moment, the
2135  * master thread may exit the barrier code and free the team data structure,
2136  * and return the threads to the thread pool).
2137  *
2138  * This does not work with the tasking code, as the thread is still
2139  * expected to participate in the execution of any tasks that may have been
2140  * spawned by a member of the team, and the thread still needs access
2141  * to each thread in the team, so that it can steal work from it.
2142  *
2143  * Enter the existence of the kmp_task_team_t struct. It employs a reference
2144  * counting mechanism, and is allocated by the master thread before calling
2145  * __kmp_<barrier_kind>_release, and then is released by the last thread to
2146  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
2147  * of the kmp_task_team_t structs for consecutive barriers can overlap
2148  * (and will, unless the master thread is the last thread to exit the barrier
2149  * release phase, which is not typical).
2150  *
2151  * The existence of such a struct is useful outside the context of tasking,
2152  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2153  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2154  * libraries.
2155  *
2156  * We currently use the existence of the threads array as an indicator that
2157  * tasks were spawned since the last barrier. If the structure is to be
2158  * useful outside the context of tasking, then this will have to change, but
2159  * not setting the field minimizes the performance impact of tasking on
2160  * barriers, when no explicit tasks were spawned (pushed, actually).
2161  */
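// Illustrative sketch (derived from __kmp_task_team_setup()/__kmp_task_team_sync()
// below, not new behavior): each kmp_team_t carries two task teams,
// t.t_task_team[0] and t.t_task_team[1], and a thread selects between them with
// its th_task_state parity bit:
//
//     kmp_task_team_t *tt = team->t.t_task_team[ thr->th.th_task_state ];
//     // At the barrier release, __kmp_task_team_sync() flips the parity so the
//     // thread starts using the other, freshly (re)initialized task team:
//     thr->th.th_task_state = 1 - thr->th.th_task_state;
//     TCW_PTR( thr->th.th_task_team, team->t.t_task_team[ thr->th.th_task_state ] );
//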
2162 
2163 
2164 static kmp_task_team_t *__kmp_free_task_teams = NULL; // Free list for task_team data structures
2165 // Lock for task team data structures
2166 static kmp_bootstrap_lock_t __kmp_task_team_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_task_team_lock );
2167 
2168 
2169 //------------------------------------------------------------------------------
2170 // __kmp_alloc_task_deque:
2171 // Allocates a task deque for a particular thread, and initializes the necessary
2172 // data structures relating to the deque. This only happens once per thread
2173 // per task team since task teams are recycled.
2174 // No lock is needed during allocation since each thread allocates its own
2175 // deque.
2176 
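// Illustrative note: the deque is a fixed-size ring buffer. The head/tail
// arithmetic used throughout this file wraps with a bit-mask, which assumes
// TASK_DEQUE_SIZE is a power of two and TASK_DEQUE_MASK == TASK_DEQUE_SIZE - 1:
//
//     deque[ tail ] = taskdata;                    // push at the tail (owner)
//     tail = ( tail + 1 ) & TASK_DEQUE_MASK;       // wrap without a branch
//     tail = ( tail - 1 ) & TASK_DEQUE_MASK;       // owner pops LIFO from the tail
//     head = ( head + 1 ) & TASK_DEQUE_MASK;       // a thief pops FIFO from the head
//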
2177 static void
2178 __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data )
2179 {
2180  __kmp_init_bootstrap_lock( & thread_data -> td.td_deque_lock );
2181  KMP_DEBUG_ASSERT( thread_data -> td.td_deque == NULL );
2182 
2183  // Initialize last stolen task field to "none"
2184  thread_data -> td.td_deque_last_stolen = -1;
2185 
2186  KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) == 0 );
2187  KMP_DEBUG_ASSERT( thread_data -> td.td_deque_head == 0 );
2188  KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
2189 
2190  KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2191  __kmp_gtid_from_thread( thread ), TASK_DEQUE_SIZE, thread_data ) );
2192  // Allocate space for task deque, and zero the deque
2193  // Cannot use __kmp_thread_calloc() because threads not around for
2194  // kmp_reap_task_team( ).
2195  thread_data -> td.td_deque = (kmp_taskdata_t **)
2196  __kmp_allocate( TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2197 }
2198 
2199 
2200 //------------------------------------------------------------------------------
2201 // __kmp_free_task_deque:
2202 // Deallocates a task deque for a particular thread.
2203 // Happens at library deallocation, so there is no need to reset all thread data fields.
2204 
2205 static void
2206 __kmp_free_task_deque( kmp_thread_data_t *thread_data )
2207 {
2208  __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
2209 
2210  if ( thread_data -> td.td_deque != NULL ) {
2211  TCW_4(thread_data -> td.td_deque_ntasks, 0);
2212  __kmp_free( thread_data -> td.td_deque );
2213  thread_data -> td.td_deque = NULL;
2214  }
2215  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
2216 
2217 #ifdef BUILD_TIED_TASK_STACK
2218  // GEH: Figure out what to do here for td_susp_tied_tasks
2219  if ( thread_data -> td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY ) {
2220  __kmp_free_task_stack( __kmp_thread_from_gtid( gtid ), thread_data );
2221  }
2222 #endif // BUILD_TIED_TASK_STACK
2223 }
2224 
2225 
2226 //------------------------------------------------------------------------------
2227 // __kmp_realloc_task_threads_data:
2228 // Allocates a threads_data array for a task team, either by allocating an initial
2229 // array or enlarging an existing array. Only the first thread to get the lock
2230 // allocs or enlarges the array and re-initializes the array elements.
2231 // That thread returns "TRUE", the rest return "FALSE".
2232 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2233 // The current size is given by task_team -> tt.tt_max_threads.
2234 
2235 static int
2236 __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team )
2237 {
2238  kmp_thread_data_t ** threads_data_p;
2239  kmp_int32 nthreads, maxthreads;
2240  int is_init_thread = FALSE;
2241 
2242  if ( TCR_4(task_team -> tt.tt_found_tasks) ) {
2243  // Already reallocated and initialized.
2244  return FALSE;
2245  }
2246 
2247  threads_data_p = & task_team -> tt.tt_threads_data;
2248  nthreads = task_team -> tt.tt_nproc;
2249  maxthreads = task_team -> tt.tt_max_threads;
2250 
2251  // All threads must lock when they encounter the first task of the implicit task
2252  // region to make sure threads_data fields are (re)initialized before used.
2253  __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2254 
2255  if ( ! TCR_4(task_team -> tt.tt_found_tasks) ) {
2256  // first thread to enable tasking
2257  kmp_team_t *team = thread -> th.th_team;
2258  int i;
2259 
2260  is_init_thread = TRUE;
2261  if ( maxthreads < nthreads ) {
2262 
2263  if ( *threads_data_p != NULL ) {
2264  kmp_thread_data_t *old_data = *threads_data_p;
2265  kmp_thread_data_t *new_data = NULL;
2266 
2267  KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d reallocating "
2268  "threads data for task_team %p, new_size = %d, old_size = %d\n",
2269  __kmp_gtid_from_thread( thread ), task_team,
2270  nthreads, maxthreads ) );
2271  // Reallocate threads_data to have more elements than current array
2272  // Cannot use __kmp_thread_realloc() because threads not around for
2273  // kmp_reap_task_team( ). Note all new array entries are initialized
2274  // to zero by __kmp_allocate().
2275  new_data = (kmp_thread_data_t *)
2276  __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2277  // copy old data to new data
2278  KMP_MEMCPY_S( (void *) new_data, nthreads * sizeof(kmp_thread_data_t),
2279  (void *) old_data,
2280  maxthreads * sizeof(kmp_thread_data_t) );
2281 
2282 #ifdef BUILD_TIED_TASK_STACK
2283  // GEH: Figure out if this is the right thing to do
2284  for (i = maxthreads; i < nthreads; i++) {
2285  kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2286  __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2287  }
2288 #endif // BUILD_TIED_TASK_STACK
2289  // Install the new data and free the old data
2290  (*threads_data_p) = new_data;
2291  __kmp_free( old_data );
2292  }
2293  else {
2294  KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d allocating "
2295  "threads data for task_team %p, size = %d\n",
2296  __kmp_gtid_from_thread( thread ), task_team, nthreads ) );
2297  // Make the initial allocate for threads_data array, and zero entries
2298  // Cannot use __kmp_thread_calloc() because threads not around for
2299  // kmp_reap_task_team( ).
2300  *threads_data_p = (kmp_thread_data_t *)
2301  __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2302 #ifdef BUILD_TIED_TASK_STACK
2303  // GEH: Figure out if this is the right thing to do
2304  for (i = 0; i < nthreads; i++) {
2305  kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2306  __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2307  }
2308 #endif // BUILD_TIED_TASK_STACK
2309  }
2310  task_team -> tt.tt_max_threads = nthreads;
2311  }
2312  else {
2313  // If array has (more than) enough elements, go ahead and use it
2314  KMP_DEBUG_ASSERT( *threads_data_p != NULL );
2315  }
2316 
2317  // initialize threads_data pointers back to thread_info structures
2318  for (i = 0; i < nthreads; i++) {
2319  kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2320  thread_data -> td.td_thr = team -> t.t_threads[i];
2321 
2322  if ( thread_data -> td.td_deque_last_stolen >= nthreads) {
2323  // The last stolen field survives across teams / barrier, and the number
2324  // of threads may have changed. It's possible (likely?) that a new
2325  // parallel region will exhibit the same behavior as the previous region.
2326  thread_data -> td.td_deque_last_stolen = -1;
2327  }
2328  }
2329 
2330  KMP_MB();
2331  TCW_SYNC_4(task_team -> tt.tt_found_tasks, TRUE);
2332  }
2333 
2334  __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2335  return is_init_thread;
2336 }
2337 
2338 
2339 //------------------------------------------------------------------------------
2340 // __kmp_free_task_threads_data:
2341 // Deallocates a threads_data array for a task team, including any attached
2342 // tasking deques. Only occurs at library shutdown.
2343 
2344 static void
2345 __kmp_free_task_threads_data( kmp_task_team_t *task_team )
2346 {
2347  __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2348  if ( task_team -> tt.tt_threads_data != NULL ) {
2349  int i;
2350  for (i = 0; i < task_team->tt.tt_max_threads; i++ ) {
2351  __kmp_free_task_deque( & task_team -> tt.tt_threads_data[i] );
2352  }
2353  __kmp_free( task_team -> tt.tt_threads_data );
2354  task_team -> tt.tt_threads_data = NULL;
2355  }
2356  __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2357 }
2358 
2359 
2360 //------------------------------------------------------------------------------
2361 // __kmp_allocate_task_team:
2362 // Allocates a task team associated with a specific team, taking it from
2363 // the global task team free list if possible. Also initializes data structures.
2364 
2365 static kmp_task_team_t *
2366 __kmp_allocate_task_team( kmp_info_t *thread, kmp_team_t *team )
2367 {
2368  kmp_task_team_t *task_team = NULL;
2369  int nthreads;
2370 
2371  KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d entering; team = %p\n",
2372  (thread ? __kmp_gtid_from_thread( thread ) : -1), team ) );
2373 
2374  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2375  // Take a task team from the task team pool
2376  __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2377  if (__kmp_free_task_teams != NULL) {
2378  task_team = __kmp_free_task_teams;
2379  TCW_PTR(__kmp_free_task_teams, task_team -> tt.tt_next);
2380  task_team -> tt.tt_next = NULL;
2381  }
2382  __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2383  }
2384 
2385  if (task_team == NULL) {
2386  KE_TRACE( 10, ( "__kmp_allocate_task_team: T#%d allocating "
2387  "task team for team %p\n",
2388  __kmp_gtid_from_thread( thread ), team ) );
2389  // Allocate a new task team if one is not available.
2390  // Cannot use __kmp_thread_malloc() because threads not around for
2391  // kmp_reap_task_team( ).
2392  task_team = (kmp_task_team_t *) __kmp_allocate( sizeof(kmp_task_team_t) );
2393  __kmp_init_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2394  //task_team -> tt.tt_threads_data = NULL; // AC: __kmp_allocate zeroes returned memory
2395  //task_team -> tt.tt_max_threads = 0;
2396  //task_team -> tt.tt_next = NULL;
2397  }
2398 
2399  TCW_4(task_team -> tt.tt_found_tasks, FALSE);
2400 #if OMP_41_ENABLED
2401  TCW_4(task_team -> tt.tt_found_proxy_tasks, FALSE);
2402 #endif
2403  task_team -> tt.tt_nproc = nthreads = team->t.t_nproc;
2404 
2405  TCW_4( task_team -> tt.tt_unfinished_threads, nthreads );
2406  TCW_4( task_team -> tt.tt_active, TRUE );
2407 
2408  KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d exiting; task_team = %p unfinished_threads init'd to %d\n",
2409  (thread ? __kmp_gtid_from_thread( thread ) : -1), task_team, task_team -> tt.tt_unfinished_threads) );
2410  return task_team;
2411 }
2412 
2413 
2414 //------------------------------------------------------------------------------
2415 // __kmp_free_task_team:
2416 // Frees the task team associated with a specific thread, and adds it
2417 // to the global task team free list.
2418 
2419 void
2420 __kmp_free_task_team( kmp_info_t *thread, kmp_task_team_t *task_team )
2421 {
2422  KA_TRACE( 20, ( "__kmp_free_task_team: T#%d task_team = %p\n",
2423  thread ? __kmp_gtid_from_thread( thread ) : -1, task_team ) );
2424 
2425  // Put task team back on free list
2426  __kmp_acquire_bootstrap_lock( & __kmp_task_team_lock );
2427 
2428  KMP_DEBUG_ASSERT( task_team -> tt.tt_next == NULL );
2429  task_team -> tt.tt_next = __kmp_free_task_teams;
2430  TCW_PTR(__kmp_free_task_teams, task_team);
2431 
2432  __kmp_release_bootstrap_lock( & __kmp_task_team_lock );
2433 }
2434 
2435 
2436 //------------------------------------------------------------------------------
2437 // __kmp_reap_task_teams:
2438 // Free all the task teams on the task team free list.
2439 // Should only be done during library shutdown.
2440 // Cannot do anything that needs a thread structure or gtid since they are already gone.
2441 
2442 void
2443 __kmp_reap_task_teams( void )
2444 {
2445  kmp_task_team_t *task_team;
2446 
2447  if ( TCR_PTR(__kmp_free_task_teams) != NULL ) {
2448  // Free all task_teams on the free list
2449  __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2450  while ( ( task_team = __kmp_free_task_teams ) != NULL ) {
2451  __kmp_free_task_teams = task_team -> tt.tt_next;
2452  task_team -> tt.tt_next = NULL;
2453 
2454  // Free threads_data if necessary
2455  if ( task_team -> tt.tt_threads_data != NULL ) {
2456  __kmp_free_task_threads_data( task_team );
2457  }
2458  __kmp_free( task_team );
2459  }
2460  __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2461  }
2462 }
2463 
2464 //------------------------------------------------------------------------------
2465 // __kmp_wait_to_unref_task_teams:
2466 // Some threads could still be in the fork barrier release code, possibly
2467 // trying to steal tasks. Wait for each thread to unreference its task team.
2468 //
2469 void
2470 __kmp_wait_to_unref_task_teams(void)
2471 {
2472  kmp_info_t *thread;
2473  kmp_uint32 spins;
2474  int done;
2475 
2476  KMP_INIT_YIELD( spins );
2477 
2478 
2479  for (;;) {
2480  done = TRUE;
2481 
2482  // TODO: GEH - this may be wrong because some sync would be necessary
2483  // in case threads are added to the pool during the traversal.
2484  // Need to verify that lock for thread pool is held when calling
2485  // this routine.
2486  for (thread = (kmp_info_t *)__kmp_thread_pool;
2487  thread != NULL;
2488  thread = thread->th.th_next_pool)
2489  {
2490 #if KMP_OS_WINDOWS
2491  DWORD exit_val;
2492 #endif
2493  if ( TCR_PTR(thread->th.th_task_team) == NULL ) {
2494  KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
2495  __kmp_gtid_from_thread( thread ) ) );
2496  continue;
2497  }
2498 #if KMP_OS_WINDOWS
2499  // TODO: GEH - add this check for Linux* OS / OS X* as well?
2500  if (!__kmp_is_thread_alive(thread, &exit_val)) {
2501  thread->th.th_task_team = NULL;
2502  continue;
2503  }
2504 #endif
2505 
2506  done = FALSE; // Because th_task_team pointer is not NULL for this thread
2507 
2508  KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to unreference task_team\n",
2509  __kmp_gtid_from_thread( thread ) ) );
2510 
2511  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
2512  volatile void *sleep_loc;
2513  // If the thread is sleeping, awaken it.
2514  if ( ( sleep_loc = TCR_PTR( thread->th.th_sleep_loc) ) != NULL ) {
2515  KA_TRACE( 10, ( "__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
2516  __kmp_gtid_from_thread( thread ), __kmp_gtid_from_thread( thread ) ) );
2517  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2518  }
2519  }
2520  }
2521  if (done) {
2522  break;
2523  }
2524 
2525  // If we are oversubscribed,
2526  // or have waited a bit (and library mode is throughput), yield.
2527  // Pause is in the following code.
2528  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2529  KMP_YIELD_SPIN( spins ); // Yields only if KMP_LIBRARY=throughput
2530  }
2531 
2532 
2533 }
2534 
2535 
2536 //------------------------------------------------------------------------------
2537 // __kmp_task_team_setup: Create a task_team for the current team, but use
2538 // an already created, unused one if it already exists.
2539 void
2540 __kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int always )
2541 {
2542  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2543 
2544  // If this task_team hasn't been created yet, allocate it. It will be used in the region after the next.
2545  // If it exists, it is the current task team and shouldn't be touched yet as it may still be in use.
2546  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && (always || team->t.t_nproc > 1) ) {
2547  team->t.t_task_team[this_thr->th.th_task_state] = __kmp_allocate_task_team( this_thr, team );
2548  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p for team %d at parity=%d\n",
2549  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[this_thr->th.th_task_state],
2550  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2551  }
2552 
2553  // After threads exit the release, they will call sync, and then point to this other task_team; make sure it is
2554  // allocated and properly initialized. As threads spin in the barrier release phase, they will continue to use the
2555  // previous task_team struct(above), until they receive the signal to stop checking for tasks (they can't safely
2556  // reference the kmp_team_t struct, which could be reallocated by the master thread). No task teams are formed for
2557  // serialized teams.
2558  if (team->t.t_nproc > 1) {
2559  int other_team = 1 - this_thr->th.th_task_state;
2560  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
2561  team->t.t_task_team[other_team] = __kmp_allocate_task_team( this_thr, team );
2562  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new task_team %p for team %d at parity=%d\n",
2563  __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
2564  ((team != NULL) ? team->t.t_id : -1), other_team ));
2565  }
2566  else { // Leave the old task team struct in place for the upcoming region; adjust as needed
2567  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
2568  if (!task_team->tt.tt_active || team->t.t_nproc != task_team->tt.tt_nproc) {
2569  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
2570  TCW_4(task_team->tt.tt_found_tasks, FALSE);
2571 #if OMP_41_ENABLED
2572  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2573 #endif
2574  TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc );
2575  TCW_4(task_team->tt.tt_active, TRUE );
2576  }
2577  // if team size has changed, the first thread to enable tasking will realloc threads_data if necessary
2578  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team %p for team %d at parity=%d\n",
2579  __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
2580  ((team != NULL) ? team->t.t_id : -1), other_team ));
2581  }
2582  }
2583 }
2584 
2585 
2586 //------------------------------------------------------------------------------
2587 // __kmp_task_team_sync: Propagation of task team data from team to threads
2588 // which happens just after the release phase of a team barrier. This may be
2589 // called by any thread, but only for teams with # threads > 1.
2590 
2591 void
2592 __kmp_task_team_sync( kmp_info_t *this_thr, kmp_team_t *team )
2593 {
2594  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2595 
2596  // Toggle the th_task_state field, to switch which task_team this thread refers to
2597  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2598  // It is now safe to propagate the task team pointer from the team struct to the current thread.
2599  TCW_PTR(this_thr->th.th_task_team, team->t.t_task_team[this_thr->th.th_task_state]);
2600  KA_TRACE(20, ("__kmp_task_team_sync: Thread T#%d task team switched to task_team %p from Team #%d (parity=%d)\n",
2601  __kmp_gtid_from_thread( this_thr ), this_thr->th.th_task_team,
2602  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2603 }
2604 
2605 
2606 //--------------------------------------------------------------------------------------------
2607 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the barrier gather
2608 // phase. Only called by master thread if #threads in team > 1 or if proxy tasks were created.
2609 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off by passing in 0
2610 // optionally as the last argument. When wait is zero, master thread does not wait for
2611 // unfinished_threads to reach 0.
2612 void
2613 __kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
2614  USE_ITT_BUILD_ARG(void * itt_sync_obj)
2615  , int wait)
2616 {
2617  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
2618 
2619  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2620  KMP_DEBUG_ASSERT( task_team == this_thr->th.th_task_team );
2621 
2622  if ( ( task_team != NULL ) && KMP_TASKING_ENABLED(task_team) ) {
2623  if (wait) {
2624  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks (for unfinished_threads to reach 0) on task_team = %p\n",
2625  __kmp_gtid_from_thread(this_thr), task_team));
2626  // Worker threads may have dropped through to release phase, but could still be executing tasks. Wait
2627  // here for tasks to complete. To avoid memory contention, only master thread checks termination condition.
2628  kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U);
2629  flag.wait(this_thr, TRUE
2630  USE_ITT_BUILD_ARG(itt_sync_obj));
2631  }
2632  // Deactivate the old task team, so that the worker threads will stop referencing it while spinning.
2633  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
2634  "setting active to false, setting local and team's pointer to NULL\n",
2635  __kmp_gtid_from_thread(this_thr), task_team));
2636 #if OMP_41_ENABLED
2637  KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE );
2638  TCW_SYNC_4( task_team->tt.tt_found_proxy_tasks, FALSE );
2639 #else
2640  KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 );
2641 #endif
2642  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2643  KMP_MB();
2644 
2645  TCW_PTR(this_thr->th.th_task_team, NULL);
2646  }
2647 }
2648 
2649 
2650 //------------------------------------------------------------------------------
2651 // __kmp_tasking_barrier:
2652 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
2653 // Internal function to execute all tasks prior to a regular barrier or a
2654 // join barrier. It is a full barrier itself, which unfortunately turns
2655 // regular barriers into double barriers and join barriers into 1 1/2
2656 // barriers.
2657 void
2658 __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid )
2659 {
2660  volatile kmp_uint32 *spin = &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads;
2661  int flag = FALSE;
2662  KMP_DEBUG_ASSERT( __kmp_tasking_mode == tskm_extra_barrier );
2663 
2664 #if USE_ITT_BUILD
2665  KMP_FSYNC_SPIN_INIT( spin, (kmp_uint32*) NULL );
2666 #endif /* USE_ITT_BUILD */
2667  kmp_flag_32 spin_flag(spin, 0U);
2668  while (! spin_flag.execute_tasks(thread, gtid, TRUE, &flag
2669  USE_ITT_BUILD_ARG(NULL), 0 ) ) {
2670 #if USE_ITT_BUILD
2671  // TODO: What about itt_sync_obj??
2672  KMP_FSYNC_SPIN_PREPARE( spin );
2673 #endif /* USE_ITT_BUILD */
2674 
2675  if( TCR_4(__kmp_global.g.g_done) ) {
2676  if( __kmp_global.g.g_abort )
2677  __kmp_abort_thread( );
2678  break;
2679  }
2680  KMP_YIELD( TRUE ); // GH: We always yield here
2681  }
2682 #if USE_ITT_BUILD
2683  KMP_FSYNC_SPIN_ACQUIRED( (void*) spin );
2684 #endif /* USE_ITT_BUILD */
2685 }
2686 
2687 
2688 #if OMP_41_ENABLED
2689 
2690 /* __kmp_give_task puts a task into a given thread's queue if:
2691  - the queue for that thread was created
2692  - there's space in that queue
2693 
2694  Because of this, __kmp_push_task needs to check if there's space after getting the lock
2695  */
2696 static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task )
2697 {
2698  kmp_task_team_t * task_team = thread->th.th_task_team;
2699  kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
2700  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
2701  bool result = false;
2702 
2703  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", taskdata, tid ) );
2704 
2705  // assert tasking is enabled? what if not?
2706  KMP_DEBUG_ASSERT( task_team != NULL );
2707 
2708  if (thread_data -> td.td_deque == NULL ) {
2709  // There's no queue in this thread, go find another one
2710  // We're guaranteed that at least one thread has a queue
2711  KA_TRACE(30, ("__kmp_give_task: thread %d has no queue while giving task %p.\n", tid, taskdata ) );
2712  return result;
2713  }
2714 
2715  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
2716  {
2717  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2718  return result;
2719  }
2720 
2721  __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
2722 
2723  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
2724  {
2725  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2726  goto release_and_exit;
2727  }
2728 
2729  thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
2730  // Wrap index.
2731  thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
2732  TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
2733 
2734  result = true;
2735  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", taskdata, tid ) );
2736 
2737 release_and_exit:
2738  __kmp_release_bootstrap_lock( & thread_data-> td.td_deque_lock );
2739 
2740  return result;
2741 }
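/* Illustrative interleaving (a sketch of why the capacity re-check under the lock above
   is needed; P1 and P2 are hypothetical concurrent producers targeting the same deque):

       P1: reads td_deque_ntasks == TASK_DEQUE_SIZE - 1      // appears to have space
       P2: reads td_deque_ntasks == TASK_DEQUE_SIZE - 1      // also appears to have space
       P2: acquires td_deque_lock, enqueues, ntasks == TASK_DEQUE_SIZE, releases
       P1: acquires td_deque_lock; without the second TCR_4 check it would overflow
           the fixed-size deque, so it re-tests the count and exits instead.

   The unlocked pre-check is only an optimization to avoid taking the lock when the
   deque is obviously full. */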
2742 
2743 
2744 /* The finish of a proxy task is divided into two pieces:
2745  - the top half is the one that can be done from a thread outside the team
2746  - the bottom half must be run from a thread within the team
2747 
2748  In order to run the bottom half the task gets queued back into one of the threads of the team.
2749  Once the td_incomplete_child_tasks counter of the parent is decremented the threads can leave the barriers.
2750  So, the bottom half needs to be queued before the counter is decremented. The top half is therefore divided into two parts:
2751  - things that can be run before queuing the bottom half
2752  - things that must be run after queuing the bottom half
2753 
2754  This creates a second race as the bottom half can free the task before the second top half is executed. To avoid this
2755  we use the td_incomplete_child_tasks counter of the proxy task to synchronize the top and bottom halves.
2756 */
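/* Illustrative ordering (a sketch assembled from the routines below, not an additional
   code path; some_team_thread and tid are placeholders for whatever thread the bottom
   half is handed to): an out-of-team completion of a proxy task proceeds roughly as

       __kmp_first_top_half_finish_proxy( taskdata );     // mark complete, add imaginary child
       __kmp_give_task( some_team_thread, tid, ptask );   // queue the bottom half
       __kmp_second_top_half_finish_proxy( taskdata );    // decrement parent, drop imaginary child
       ...
       __kmp_bottom_half_finish_proxy( gtid, ptask );     // later, on a team thread: waits for
                                                          // the imaginary child, releases deps,
                                                          // frees the task

   The imaginary child added in the first top half is what keeps the bottom half from
   freeing the task while the second top half is still using it. */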
2757 
2758 static void __kmp_first_top_half_finish_proxy( kmp_taskdata_t * taskdata )
2759 {
2760  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
2761  KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
2762  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
2763  KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
2764 
2765  taskdata -> td_flags.complete = 1; // mark the task as completed
2766 
2767  if ( taskdata->td_taskgroup )
2768  KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
2769 
2770  // Create an imaginary child for this task so the bottom half cannot release the task before we have completed the second top half
2771  TCR_4(taskdata->td_incomplete_child_tasks++);
2772 }
2773 
2774 static void __kmp_second_top_half_finish_proxy( kmp_taskdata_t * taskdata )
2775 {
2776  kmp_int32 children = 0;
2777 
2778  // Predecrement simulated by "- 1" calculation
2779  children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
2780  KMP_DEBUG_ASSERT( children >= 0 );
2781 
2782  // Remove the imaginary child
2783  TCR_4(taskdata->td_incomplete_child_tasks--);
2784 }
2785 
2786 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask )
2787 {
2788  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2789  kmp_info_t * thread = __kmp_threads[ gtid ];
2790 
2791  KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
2792  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 1 ); // top half must run before bottom half
2793 
2794  // We need to wait to make sure the top half is finished
2795  // Spinning here should be ok as this should happen quickly
2796  while ( TCR_4(taskdata->td_incomplete_child_tasks) > 0 ) ;
2797 
2798  __kmp_release_deps(gtid,taskdata);
2799  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
2800 }
2801 
2809 void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask )
2810 {
2811  KMP_DEBUG_ASSERT( ptask != NULL );
2812  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2813  KA_TRACE(10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata ) );
2814 
2815  KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
2816 
2817  __kmp_first_top_half_finish_proxy(taskdata);
2818  __kmp_second_top_half_finish_proxy(taskdata);
2819  __kmp_bottom_half_finish_proxy(gtid,ptask);
2820 
2821  KA_TRACE(10, ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", gtid, taskdata ) );
2822 }
2823 
2830 void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
2831 {
2832  KMP_DEBUG_ASSERT( ptask != NULL );
2833  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2834 
2835  KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", taskdata ) );
2836 
2837  KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
2838 
2839  __kmp_first_top_half_finish_proxy(taskdata);
2840 
2841  // Enqueue the task so its bottom-half completion runs on a thread within the corresponding team
2842  kmp_team_t * team = taskdata->td_team;
2843  kmp_int32 nthreads = team->t.t_nproc;
2844  kmp_info_t *thread;
2845  kmp_int32 k = 0;
2846 
2847  do {
2848  // This should be similar to k = __kmp_get_random( thread ) % nthreads, but we cannot use __kmp_get_random here.
2849  // For now we just probe threads linearly until one accepts the task (an illustrative alternative sketch follows this function).
2850  k = (k+1) % nthreads;
2851  thread = team->t.t_threads[k];
2852  } while ( !__kmp_give_task( thread, k, ptask ) );
2853 
2854  __kmp_second_top_half_finish_proxy(taskdata);
2855 
2856  KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
2857 }
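/* Hedged alternative to the linear scan above (illustrative only, not part of the runtime):
   a starting index could be derived from the task pointer itself, since __kmp_get_random()
   is called with a thread descriptor that, per the note above, is not usable here. Assuming
   nothing beyond what is already in scope in the function above:

       kmp_int32 k = (kmp_int32)( ((kmp_intptr_t)ptask >> 8) % nthreads );
       kmp_info_t *thread;
       do {
           k = (k + 1) % nthreads;
           thread = team->t.t_threads[k];
       } while ( !__kmp_give_task( thread, k, ptask ) );

   This would spread repeated out-of-order completions across the team instead of always
   probing the same threads first. */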
2858 
2859 #endif