LLVM OpenMP* Runtime Library
kmp_taskq.cpp
1 /*
2  * kmp_taskq.cpp -- TASKQ support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_error.h"
16 #include "kmp_i18n.h"
17 #include "kmp_io.h"
18 
19 #define MAX_MESSAGE 512
20 
21 /* Taskq routines and global variables */
22 
23 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
24 
25 #define THREAD_ALLOC_FOR_TASKQ
26 
27 static int in_parallel_context(kmp_team_t *team) {
28  return !team->t.t_serialized;
29 }
30 
31 static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
32  int gtid = *gtid_ref;
33  int tid = __kmp_tid_from_gtid(gtid);
34  kmp_uint32 my_token;
35  kmpc_task_queue_t *taskq;
36  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
37 
38  if (__kmp_env_consistency_check)
39 #if KMP_USE_DYNAMIC_LOCK
40  __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL, 0);
41 #else
42  __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL);
43 #endif
44 
45  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
46  KMP_MB(); /* Flush all pending memory write invalidates. */
47 
48  /* GEH - need check here under stats to make sure */
49  /* inside task (curr_thunk[*tid_ref] != NULL) */
50 
51  my_token = tq->tq_curr_thunk[tid]->th_tasknum;
52 
53  taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
54 
55  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
56  KMP_MB();
57  }
58 }
59 
60 static void __kmp_taskq_xo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  int gtid = *gtid_ref;
62  int tid = __kmp_tid_from_gtid(gtid);
63  kmp_uint32 my_token;
64  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
65 
66  if (__kmp_env_consistency_check)
67  __kmp_pop_sync(gtid, ct_ordered_in_taskq, loc_ref);
68 
69  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
70  KMP_MB(); /* Flush all pending memory write invalidates. */
71 
72  /* GEH - need check here under stats to make sure */
73  /* inside task (curr_thunk[tid] != NULL) */
74 
75  my_token = tq->tq_curr_thunk[tid]->th_tasknum;
76 
77  KMP_MB(); /* Flush all pending memory write invalidates. */
78 
79  tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue->tq_tasknum_serving =
80  my_token + 1;
81 
82  KMP_MB(); /* Flush all pending memory write invalidates. */
83  }
84 }
85 
86 static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) {
87  kmp_uint32 my_token;
88  kmpc_task_queue_t *taskq;
89 
90  /* assume we are always called from an active parallel context */
91 
92  KMP_MB(); /* Flush all pending memory write invalidates. */
93 
94  my_token = thunk->th_tasknum;
95 
96  taskq = thunk->th.th_shareds->sv_queue;
97 
98  if (taskq->tq_tasknum_serving <= my_token) {
99  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
100  KMP_MB();
101  taskq->tq_tasknum_serving = my_token + 1;
102  KMP_MB();
103  }
104 }
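
#if 0 /* Illustrative sketch (editor's commentary), not compiled into the runtime. */
/* A minimal, hedged model of the ticket-style ORDERED handling implemented by
   __kmp_taskq_eo / __kmp_taskq_xo / __kmp_taskq_check_ordered above: each task
   carries a ticket (th_tasknum) assigned when it is queued, and its ORDERED
   section may run only once the queue's tq_tasknum_serving counter reaches
   that ticket.  The toy_* names and the plain spin loop are assumptions made
   for this sketch only; the runtime uses KMP_WAIT_YIELD and memory fences. */
struct toy_ordered_state {
  volatile unsigned serving; /* ticket currently allowed into ORDERED */
};

static void toy_ordered_enter(struct toy_ordered_state *s, unsigned my_token) {
  while (s->serving < my_token) { /* wait for our turn */
  }
}

static void toy_ordered_exit(struct toy_ordered_state *s, unsigned my_token) {
  s->serving = my_token + 1; /* hand the turn to the next queued task */
}
#endif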
105 
106 #ifdef KMP_DEBUG
107 
108 static void __kmp_dump_TQF(kmp_int32 flags) {
109  if (flags & TQF_IS_ORDERED)
110  __kmp_printf("ORDERED ");
111  if (flags & TQF_IS_LASTPRIVATE)
112  __kmp_printf("LAST_PRIV ");
113  if (flags & TQF_IS_NOWAIT)
114  __kmp_printf("NOWAIT ");
115  if (flags & TQF_HEURISTICS)
116  __kmp_printf("HEURIST ");
117  if (flags & TQF_INTERFACE_RESERVED1)
118  __kmp_printf("RESERV1 ");
119  if (flags & TQF_INTERFACE_RESERVED2)
120  __kmp_printf("RESERV2 ");
121  if (flags & TQF_INTERFACE_RESERVED3)
122  __kmp_printf("RESERV3 ");
123  if (flags & TQF_INTERFACE_RESERVED4)
124  __kmp_printf("RESERV4 ");
125  if (flags & TQF_IS_LAST_TASK)
126  __kmp_printf("LAST_TASK ");
127  if (flags & TQF_TASKQ_TASK)
128  __kmp_printf("TASKQ_TASK ");
129  if (flags & TQF_RELEASE_WORKERS)
130  __kmp_printf("RELEASE ");
131  if (flags & TQF_ALL_TASKS_QUEUED)
132  __kmp_printf("ALL_QUEUED ");
133  if (flags & TQF_PARALLEL_CONTEXT)
134  __kmp_printf("PARALLEL ");
135  if (flags & TQF_DEALLOCATED)
136  __kmp_printf("DEALLOC ");
137  if (!(flags & (TQF_INTERNAL_FLAGS | TQF_INTERFACE_FLAGS)))
138  __kmp_printf("(NONE)");
139 }
140 
141 static void __kmp_dump_thunk(kmp_taskq_t *tq, kmpc_thunk_t *thunk,
142  kmp_int32 global_tid) {
143  int i;
144  int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
145 
146  __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
147 
148  if (thunk != NULL) {
149  for (i = 0; i < nproc; i++) {
150  if (tq->tq_curr_thunk[i] == thunk) {
151  __kmp_printf("[%i] ", i);
152  }
153  }
154  __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
155  __kmp_printf("th_task=%p, ", thunk->th_task);
156  __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
157  __kmp_printf("th_status=%d, ", thunk->th_status);
158  __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
159  __kmp_printf("th_flags=");
160  __kmp_dump_TQF(thunk->th_flags);
161  }
162 
163  __kmp_printf("\n");
164 }
165 
166 static void __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num) {
167  kmpc_thunk_t *th;
168 
169  __kmp_printf(" Thunk stack for T#%d: ", thread_num);
170 
171  for (th = thunk; th != NULL; th = th->th_encl_thunk)
172  __kmp_printf("%p ", th);
173 
174  __kmp_printf("\n");
175 }
176 
177 static void __kmp_dump_task_queue(kmp_taskq_t *tq, kmpc_task_queue_t *queue,
178  kmp_int32 global_tid) {
179  int qs, count, i;
180  kmpc_thunk_t *thunk;
181  kmpc_task_queue_t *taskq;
182 
183  __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
184 
185  if (queue != NULL) {
186  int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
187 
188  if (__kmp_env_consistency_check) {
189  __kmp_printf(" tq_loc : ");
190  }
191  if (in_parallel) {
192 
193  // if (queue->tq.tq_parent != 0)
194  //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
195 
196  //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
197 
198  // Make sure data structures are in consistent state before querying them
199  // Seems to work without this for digital/alpha, needed for IBM/RS6000
200  KMP_MB();
201 
202  __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
203  __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
204  __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
205  __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
206  __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
207 
208  //__kmp_release_lock(& queue->tq_link_lck, global_tid);
209 
210  // if (queue->tq.tq_parent != 0)
211  //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
212 
213  //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
214  //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
215 
216  // Make sure data structures are in consistent state before querying them
217  // Seems to work without this for digital/alpha, needed for IBM/RS6000
218  KMP_MB();
219  }
220 
221  __kmp_printf(" tq_shareds : ");
222  for (i = 0; i < ((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
223  __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
224  __kmp_printf("\n");
225 
226  if (in_parallel) {
227  __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
228  __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
229  }
230 
231  __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
232  __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
233  __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
234 
235  __kmp_printf(" tq_free_thunks : ");
236  for (thunk = queue->tq_free_thunks; thunk != NULL;
237  thunk = thunk->th.th_next_free)
238  __kmp_printf("%p ", thunk);
239  __kmp_printf("\n");
240 
241  __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
242  __kmp_printf(" tq_head : %d\n", queue->tq_head);
243  __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
244  __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
245  __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
246  __kmp_printf(" tq_flags : ");
247  __kmp_dump_TQF(queue->tq_flags);
248  __kmp_printf("\n");
249 
250  if (in_parallel) {
251  __kmp_printf(" tq_th_thunks : ");
252  for (i = 0; i < queue->tq_nproc; i++) {
253  __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
254  }
255  __kmp_printf("\n");
256  }
257 
258  __kmp_printf("\n");
259  __kmp_printf(" Queue slots:\n");
260 
261  qs = queue->tq_tail;
262  for (count = 0; count < queue->tq_nfull; ++count) {
263  __kmp_printf("(%d)", qs);
264  __kmp_dump_thunk(tq, queue->tq_queue[qs].qs_thunk, global_tid);
265  qs = (qs + 1) % queue->tq_nslots;
266  }
267 
268  __kmp_printf("\n");
269 
270  if (in_parallel) {
271  if (queue->tq_taskq_slot != NULL) {
272  __kmp_printf(" TaskQ slot:\n");
273  __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot),
274  global_tid);
275  __kmp_printf("\n");
276  }
277  //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
278  //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
279  }
280  }
281 
282  __kmp_printf(" Taskq freelist: ");
283 
284  //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
285 
286  // Make sure data structures are in consistent state before querying them
287  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
288  KMP_MB();
289 
290  for (taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free)
291  __kmp_printf("%p ", taskq);
292 
293  //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
294 
295  __kmp_printf("\n\n");
296 }
297 
298 static void __kmp_aux_dump_task_queue_tree(kmp_taskq_t *tq,
299  kmpc_task_queue_t *curr_queue,
300  kmp_int32 level,
301  kmp_int32 global_tid) {
302  int i, count, qs;
303  int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
304  kmpc_task_queue_t *queue = curr_queue;
305 
306  if (curr_queue == NULL)
307  return;
308 
309  __kmp_printf(" ");
310 
311  for (i = 0; i < level; i++)
312  __kmp_printf(" ");
313 
314  __kmp_printf("%p", curr_queue);
315 
316  for (i = 0; i < nproc; i++) {
317  if (tq->tq_curr_thunk[i] &&
318  tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue) {
319  __kmp_printf(" [%i]", i);
320  }
321  }
322 
323  __kmp_printf(":");
324 
325  //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
326 
327  // Make sure data structures are in consistent state before querying them
328  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
329  KMP_MB();
330 
331  qs = curr_queue->tq_tail;
332 
333  for (count = 0; count < curr_queue->tq_nfull; ++count) {
334  __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
335  qs = (qs + 1) % curr_queue->tq_nslots;
336  }
337 
338  //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
339 
340  __kmp_printf("\n");
341 
342  if (curr_queue->tq_first_child) {
343  //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
344 
345  // Make sure data structures are in consistent state before querying them
346  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
347  KMP_MB();
348 
349  if (curr_queue->tq_first_child) {
350  for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
351  queue != NULL; queue = queue->tq_next_child) {
352  __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid);
353  }
354  }
355 
356  //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
357  }
358 }
359 
360 static void __kmp_dump_task_queue_tree(kmp_taskq_t *tq,
361  kmpc_task_queue_t *tqroot,
362  kmp_int32 global_tid) {
363  __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
364 
365  __kmp_aux_dump_task_queue_tree(tq, tqroot, 0, global_tid);
366 
367  __kmp_printf("\n");
368 }
369 #endif
370 
371 /* New taskq storage routines that try to minimize overhead of mallocs but
372  still provide cache line alignment. */
373 static void *__kmp_taskq_allocate(size_t size, kmp_int32 global_tid) {
374  void *addr, *orig_addr;
375  size_t bytes;
376 
377  KB_TRACE(5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int)size,
378  global_tid));
379 
380  bytes = sizeof(void *) + CACHE_LINE + size;
381 
382 #ifdef THREAD_ALLOC_FOR_TASKQ
383  orig_addr =
384  (void *)__kmp_thread_malloc(__kmp_thread_from_gtid(global_tid), bytes);
385 #else
386  KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", bytes));
387  orig_addr = (void *)KMP_INTERNAL_MALLOC(bytes);
388 #endif /* THREAD_ALLOC_FOR_TASKQ */
389 
390  if (orig_addr == 0)
391  KMP_FATAL(OutOfHeapMemory);
392 
393  addr = orig_addr;
394 
395  if (((kmp_uintptr_t)addr & (CACHE_LINE - 1)) != 0) {
396  KB_TRACE(50, ("__kmp_taskq_allocate: adjust for cache alignment\n"));
397  addr = (void *)(((kmp_uintptr_t)addr + CACHE_LINE) & ~(CACHE_LINE - 1));
398  }
399 
400  (*(void **)addr) = orig_addr;
401 
402  KB_TRACE(10,
403  ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, "
404  "gtid: %d\n",
405  orig_addr, ((void **)addr) + 1,
406  ((char *)(((void **)addr) + 1)) + size - 1, (int)size, global_tid));
407 
408  return (((void **)addr) + 1);
409 }
410 
411 static void __kmpc_taskq_free(void *p, kmp_int32 global_tid) {
412  KB_TRACE(5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid));
413 
414  KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n",
415  (*(((void **)p) - 1)), global_tid));
416 
417 #ifdef THREAD_ALLOC_FOR_TASKQ
418  __kmp_thread_free(__kmp_thread_from_gtid(global_tid), *(((void **)p) - 1));
419 #else
420  KMP_INTERNAL_FREE(*(((void **)p) - 1));
421 #endif /* THREAD_ALLOC_FOR_TASKQ */
422 }
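
#if 0 /* Illustrative sketch (editor's commentary), not compiled into the runtime. */
/* A minimal, hedged model of the bookkeeping used by __kmp_taskq_allocate and
   __kmpc_taskq_free above: over-allocate by one pointer plus a cache line,
   round the block address up to a cache-line boundary, store the pointer
   actually returned by the allocator in the first word of that aligned block,
   and hand back the address just past the saved pointer.  The free path reads
   the word just below the user pointer to recover the original block.  The
   toy_* names and the use of plain malloc/free are assumptions made for this
   sketch only. */
#include <stdint.h>
#include <stdlib.h>

#define TOY_CACHE_LINE 64

static void *toy_taskq_allocate(size_t size) {
  void *orig = malloc(sizeof(void *) + TOY_CACHE_LINE + size);
  uintptr_t addr = (uintptr_t)orig;

  if (orig == NULL)
    return NULL; /* the runtime aborts here instead (KMP_FATAL) */

  if (addr & (TOY_CACHE_LINE - 1)) /* round up to the next cache line */
    addr = (addr + TOY_CACHE_LINE) & ~(uintptr_t)(TOY_CACHE_LINE - 1);

  *(void **)addr = orig;    /* remember what was really allocated */
  return (void **)addr + 1; /* caller's data starts after the saved pointer */
}

static void toy_taskq_free(void *p) {
  free(*((void **)p - 1)); /* recover and release the original block */
}
#endif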
423 
424 /* Keep freed kmpc_task_queue_t on an internal freelist and recycle since
425  they're of constant size. */
426 
427 static kmpc_task_queue_t *
428 __kmp_alloc_taskq(kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots,
429  kmp_int32 nthunks, kmp_int32 nshareds, kmp_int32 nproc,
430  size_t sizeof_thunk, size_t sizeof_shareds,
431  kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid) {
432  kmp_int32 i;
433  size_t bytes;
434  kmpc_task_queue_t *new_queue;
435  kmpc_aligned_shared_vars_t *shared_var_array;
436  char *shared_var_storage;
437  char *pt; /* for doing byte-adjusted address computations */
438 
439  __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
440 
441  // Make sure data structures are in consistent state before querying them
442  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
443  KMP_MB();
444 
445  if (tq->tq_freelist) {
446  new_queue = tq->tq_freelist;
447  tq->tq_freelist = tq->tq_freelist->tq.tq_next_free;
448 
449  KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
450 
451  new_queue->tq_flags = 0;
452 
453  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
454  } else {
455  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
456 
457  new_queue = (kmpc_task_queue_t *)__kmp_taskq_allocate(
458  sizeof(kmpc_task_queue_t), global_tid);
459  new_queue->tq_flags = 0;
460  }
461 
462  /* space in the task queue for queue slots (allocate as one big chunk */
463  /* of storage including new_taskq_task space) */
464 
465  sizeof_thunk +=
466  (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
467  pt = (char *)__kmp_taskq_allocate(nthunks * sizeof_thunk, global_tid);
468  new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
469  *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
470 
471  /* chain the allocated thunks into a freelist for this queue */
472 
473  new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
474 
475  for (i = 0; i < (nthunks - 2); i++) {
476  ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th.th_next_free =
477  (kmpc_thunk_t *)(pt + (i + 1) * sizeof_thunk);
478 #ifdef KMP_DEBUG
479  ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th_flags = TQF_DEALLOCATED;
480 #endif
481  }
482 
483  ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th.th_next_free = NULL;
484 #ifdef KMP_DEBUG
485  ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th_flags =
486  TQF_DEALLOCATED;
487 #endif
488 
489  /* initialize the locks */
490 
491  if (in_parallel) {
492  __kmp_init_lock(&new_queue->tq_link_lck);
493  __kmp_init_lock(&new_queue->tq_free_thunks_lck);
494  __kmp_init_lock(&new_queue->tq_queue_lck);
495  }
496 
497  /* now allocate the slots */
498 
499  bytes = nslots * sizeof(kmpc_aligned_queue_slot_t);
500  new_queue->tq_queue =
501  (kmpc_aligned_queue_slot_t *)__kmp_taskq_allocate(bytes, global_tid);
502 
503  /* space for array of pointers to shared variable structures */
504  sizeof_shareds += sizeof(kmpc_task_queue_t *);
505  sizeof_shareds +=
506  (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
507 
508  bytes = nshareds * sizeof(kmpc_aligned_shared_vars_t);
509  shared_var_array =
510  (kmpc_aligned_shared_vars_t *)__kmp_taskq_allocate(bytes, global_tid);
511 
512  bytes = nshareds * sizeof_shareds;
513  shared_var_storage = (char *)__kmp_taskq_allocate(bytes, global_tid);
514 
515  for (i = 0; i < nshareds; i++) {
516  shared_var_array[i].ai_data =
517  (kmpc_shared_vars_t *)(shared_var_storage + i * sizeof_shareds);
518  shared_var_array[i].ai_data->sv_queue = new_queue;
519  }
520  new_queue->tq_shareds = shared_var_array;
521 
522  /* array for number of outstanding thunks per thread */
523 
524  if (in_parallel) {
525  bytes = nproc * sizeof(kmpc_aligned_int32_t);
526  new_queue->tq_th_thunks =
527  (kmpc_aligned_int32_t *)__kmp_taskq_allocate(bytes, global_tid);
528  new_queue->tq_nproc = nproc;
529 
530  for (i = 0; i < nproc; i++)
531  new_queue->tq_th_thunks[i].ai_data = 0;
532  }
533 
534  return new_queue;
535 }
536 
537 static void __kmp_free_taskq(kmp_taskq_t *tq, kmpc_task_queue_t *p,
538  int in_parallel, kmp_int32 global_tid) {
539  __kmpc_taskq_free(p->tq_thunk_space, global_tid);
540  __kmpc_taskq_free(p->tq_queue, global_tid);
541 
542  /* free shared var structure storage */
543  __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data),
544  global_tid);
545  /* free array of pointers to shared vars storage */
546  __kmpc_taskq_free(p->tq_shareds, global_tid);
547 
548 #ifdef KMP_DEBUG
549  p->tq_first_child = NULL;
550  p->tq_next_child = NULL;
551  p->tq_prev_child = NULL;
552  p->tq_ref_count = -10;
553  p->tq_shareds = NULL;
554  p->tq_tasknum_queuing = 0;
555  p->tq_tasknum_serving = 0;
556  p->tq_queue = NULL;
557  p->tq_thunk_space = NULL;
558  p->tq_taskq_slot = NULL;
559  p->tq_free_thunks = NULL;
560  p->tq_nslots = 0;
561  p->tq_head = 0;
562  p->tq_tail = 0;
563  p->tq_nfull = 0;
564  p->tq_hiwat = 0;
565 
566  if (in_parallel) {
567  int i;
568 
569  for (i = 0; i < p->tq_nproc; i++)
570  p->tq_th_thunks[i].ai_data = 0;
571  }
572  if (__kmp_env_consistency_check)
573  p->tq_loc = NULL;
574  KMP_DEBUG_ASSERT(p->tq_flags & TQF_DEALLOCATED);
575  p->tq_flags = TQF_DEALLOCATED;
576 #endif /* KMP_DEBUG */
577 
578  if (in_parallel) {
579  __kmpc_taskq_free(p->tq_th_thunks, global_tid);
580  __kmp_destroy_lock(&p->tq_link_lck);
581  __kmp_destroy_lock(&p->tq_queue_lck);
582  __kmp_destroy_lock(&p->tq_free_thunks_lck);
583  }
584 #ifdef KMP_DEBUG
585  p->tq_th_thunks = NULL;
586 #endif /* KMP_DEBUG */
587 
588  // Make sure data structures are in consistent state before querying them
589  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
590  KMP_MB();
591 
592  __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
593  p->tq.tq_next_free = tq->tq_freelist;
594 
595  tq->tq_freelist = p;
596  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
597 }
598 
599 /* Once a group of thunks has been allocated for use in a particular queue,
600  these are managed via a per-queue freelist.
601  We force a check that there's always a thunk free if we need one. */
602 
603 static kmpc_thunk_t *__kmp_alloc_thunk(kmpc_task_queue_t *queue,
604  int in_parallel, kmp_int32 global_tid) {
605  kmpc_thunk_t *fl;
606 
607  if (in_parallel) {
608  __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
609  // Make sure data structures are in consistent state before querying them
610  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
611  KMP_MB();
612  }
613 
614  fl = queue->tq_free_thunks;
615 
616  KMP_DEBUG_ASSERT(fl != NULL);
617 
618  queue->tq_free_thunks = fl->th.th_next_free;
619  fl->th_flags = 0;
620 
621  if (in_parallel)
622  __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
623 
624  return fl;
625 }
626 
627 static void __kmp_free_thunk(kmpc_task_queue_t *queue, kmpc_thunk_t *p,
628  int in_parallel, kmp_int32 global_tid) {
629 #ifdef KMP_DEBUG
630  p->th_task = 0;
631  p->th_encl_thunk = 0;
632  p->th_status = 0;
633  p->th_tasknum = 0;
634 /* Also could zero pointers to private vars */
635 #endif
636 
637  if (in_parallel) {
638  __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
639  // Make sure data structures are in consistent state before querying them
640  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
641  KMP_MB();
642  }
643 
644  p->th.th_next_free = queue->tq_free_thunks;
645  queue->tq_free_thunks = p;
646 
647 #ifdef KMP_DEBUG
648  p->th_flags = TQF_DEALLOCATED;
649 #endif
650 
651  if (in_parallel)
652  __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
653 }
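
#if 0 /* Illustrative sketch (editor's commentary), not compiled into the runtime. */
/* A minimal, hedged model of the per-queue thunk freelist managed by
   __kmp_alloc_thunk and __kmp_free_thunk above: a LIFO singly-linked list
   threaded through the thunks themselves via th.th_next_free, protected by
   tq_free_thunks_lck when the queue is shared by a team.  The toy_* names are
   invented for this sketch; locking and debug poisoning are omitted. */
struct toy_thunk {
  struct toy_thunk *next_free;
};

struct toy_thunk_list {
  struct toy_thunk *head;
};

static struct toy_thunk *toy_alloc_thunk(struct toy_thunk_list *fl) {
  struct toy_thunk *t = fl->head; /* caller guarantees the list is non-empty */
  fl->head = t->next_free;
  return t;
}

static void toy_free_thunk(struct toy_thunk_list *fl, struct toy_thunk *t) {
  t->next_free = fl->head; /* push back onto the head */
  fl->head = t;
}
#endif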
654 
655 /* returns nonzero if the queue just became full after the enqueue */
656 static kmp_int32 __kmp_enqueue_task(kmp_taskq_t *tq, kmp_int32 global_tid,
657  kmpc_task_queue_t *queue,
658  kmpc_thunk_t *thunk, int in_parallel) {
659  kmp_int32 ret;
660 
661  /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the
662  * master is executing then) */
663  if (in_parallel) {
664  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
665  // Make sure data structures are in consistent state before querying them
666  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
667  KMP_MB();
668  }
669 
670  KMP_DEBUG_ASSERT(queue->tq_nfull < queue->tq_nslots); // check queue not full
671 
672  queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
673 
674  if (queue->tq_head >= queue->tq_nslots)
675  queue->tq_head = 0;
676 
677  (queue->tq_nfull)++;
678 
679  KMP_MB(); /* to assure that nfull is seen to increase before
680  TQF_ALL_TASKS_QUEUED is set */
681 
682  ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
683 
684  if (in_parallel) {
685  /* don't need to wait until workers are released before unlocking */
686  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
687 
688  if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
689  // If just creating the root queue, the worker threads are waiting at a
690  // join barrier until now, when there's something in the queue for them to
691  // do; release them now to do work. This should only be done when this is
692  // the first task enqueued, so reset the flag here also.
693  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers
694  are still in spin mode */
695  // avoid releasing barrier twice if taskq_task switches threads
696  KMP_MB();
697 
698  __kmpc_end_barrier_master(NULL, global_tid);
699  }
700  }
701 
702  return ret;
703 }
704 
705 static kmpc_thunk_t *__kmp_dequeue_task(kmp_int32 global_tid,
706  kmpc_task_queue_t *queue,
707  int in_parallel) {
708  kmpc_thunk_t *pt;
709  int tid = __kmp_tid_from_gtid(global_tid);
710 
711  KMP_DEBUG_ASSERT(queue->tq_nfull > 0); /* check queue not empty */
712 
713  if (queue->tq.tq_parent != NULL && in_parallel) {
714  int ct;
715  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
716  ct = ++(queue->tq_ref_count);
717  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
718  KMP_DEBUG_REF_CTS(
719  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
720  }
721 
722  pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
723 
724  if (queue->tq_tail >= queue->tq_nslots)
725  queue->tq_tail = 0;
726 
727  if (in_parallel) {
728  queue->tq_th_thunks[tid].ai_data++;
729 
730  KMP_MB(); /* necessary so ai_data increment is propagated to other threads
731  immediately (digital) */
732 
733  KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding "
734  "thunks from queue %p\n",
735  global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
736  }
737 
738  (queue->tq_nfull)--;
739 
740 #ifdef KMP_DEBUG
741  KMP_MB();
742 
743  /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is
744  * decremented */
745 
746  KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
747 
748  if (in_parallel) {
749  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <=
750  __KMP_TASKQ_THUNKS_PER_TH);
751  }
752 #endif
753 
754  return pt;
755 }
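
#if 0 /* Illustrative sketch (editor's commentary), not compiled into the runtime. */
/* A minimal, hedged model of the circular-buffer bookkeeping used by
   __kmp_enqueue_task and __kmp_dequeue_task above: tq_head is the next slot to
   fill, tq_tail the next slot to drain, tq_nfull the occupancy, and both
   indices wrap at tq_nslots.  The toy_* names are invented for this sketch;
   locking, reference counting, and worker release are omitted. */
struct toy_ring {
  void **slots;
  int nslots;
  int head;  /* next slot to enqueue into */
  int tail;  /* next slot to dequeue from */
  int nfull; /* occupied slots */
};

static int toy_enqueue(struct toy_ring *r, void *item) {
  /* caller guarantees r->nfull < r->nslots */
  r->slots[r->head++] = item;
  if (r->head >= r->nslots)
    r->head = 0;
  r->nfull++;
  return r->nfull == r->nslots; /* nonzero if the ring just became full */
}

static void *toy_dequeue(struct toy_ring *r) {
  /* caller guarantees r->nfull > 0 */
  void *item = r->slots[r->tail++];
  if (r->tail >= r->nslots)
    r->tail = 0;
  r->nfull--;
  return item;
}
#endif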
756 
757 /* Find the next (non-null) task to dequeue and return it.
758  * This is never called unless in_parallel=TRUE
759  *
760  * Here are the rules for deciding which queue to take the task from:
761  * 1. Walk up the task queue tree from the current queue's parent and look
762  * on the way up (for loop, below).
763  * 2. Do a depth-first search back down the tree from the root and
764  * look (find_task_in_descendant_queue()).
765  *
766  * Here are the rules for deciding which task to take from a queue
767  * (__kmp_find_task_in_queue ()):
768  * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
769  * must be staged to make sure we execute the last one with
770  * TQF_IS_LAST_TASK at the end of task queue execution.
771  * 2. If the queue length is below some high water mark and the taskq task
772  * is enqueued, prefer running the taskq task.
773  * 3. Otherwise, take a (normal) task from the queue.
774  *
775  * If we do all this and return pt == NULL at the bottom of this routine,
776  * this means there are no more tasks to execute (except possibly for
777  * TQF_IS_LASTPRIVATE).
778  */
779 
780 static kmpc_thunk_t *__kmp_find_task_in_queue(kmp_int32 global_tid,
781  kmpc_task_queue_t *queue) {
782  kmpc_thunk_t *pt = NULL;
783  int tid = __kmp_tid_from_gtid(global_tid);
784 
785  /* To prevent deadlock from tq_queue_lck if queue already deallocated */
786  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
787 
788  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
789 
790  /* Check again to avoid race in __kmpc_end_taskq() */
791  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
792  // Make sure data structures are in consistent state before querying them
793  // Seems to work without this for digital/alpha, needed for IBM/RS6000
794  KMP_MB();
795 
796  if ((queue->tq_taskq_slot != NULL) &&
797  (queue->tq_nfull <= queue->tq_hiwat)) {
798  /* if there's enough room in the queue and the dispatcher */
799  /* (taskq task) is available, schedule more tasks */
800  pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot);
801  queue->tq_taskq_slot = NULL;
802  } else if (queue->tq_nfull == 0 ||
803  queue->tq_th_thunks[tid].ai_data >=
804  __KMP_TASKQ_THUNKS_PER_TH) {
805  /* do nothing if no thunks available or this thread can't */
806  /* run any because it already is executing too many */
807  pt = NULL;
808  } else if (queue->tq_nfull > 1) {
809  /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
810 
811  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
812  } else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
813  // one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE
814  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
815  } else if (queue->tq_flags & TQF_IS_LAST_TASK) {
816  /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
817  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
818  /* instrumentation does copy-out. */
819  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
820  pt->th_flags |=
821  TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
822  }
823  }
824 
825  /* GEH - What happens here if is lastprivate, but not last task? */
826  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
827  }
828 
829  return pt;
830 }
831 
832 /* Walk a tree of queues starting at queue's first child and return a non-NULL
833  thunk if one can be scheduled. Must only be called when in_parallel=TRUE */
834 
835 static kmpc_thunk_t *
836 __kmp_find_task_in_descendant_queue(kmp_int32 global_tid,
837  kmpc_task_queue_t *curr_queue) {
838  kmpc_thunk_t *pt = NULL;
839  kmpc_task_queue_t *queue = curr_queue;
840 
841  if (curr_queue->tq_first_child != NULL) {
842  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
843  // Make sure data structures are in consistent state before querying them
844  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
845  KMP_MB();
846 
847  queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
848  if (queue == NULL) {
849  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
850  return NULL;
851  }
852 
853  while (queue != NULL) {
854  int ct;
855  kmpc_task_queue_t *next;
856 
857  ct = ++(queue->tq_ref_count);
858  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
859  KMP_DEBUG_REF_CTS(
860  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
861 
862  pt = __kmp_find_task_in_queue(global_tid, queue);
863 
864  if (pt != NULL) {
865  int ct;
866 
867  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
868  // Make sure data structures in consistent state before querying them
869  // Seems to work without this for digital/alpha, needed for IBM/RS6000
870  KMP_MB();
871 
872  ct = --(queue->tq_ref_count);
873  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
874  global_tid, queue, ct));
875  KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
876 
877  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
878 
879  return pt;
880  }
881 
882  /* although reference count stays active during descendant walk, shouldn't
883  matter since if children still exist, reference counts aren't being
884  monitored anyway */
885 
886  pt = __kmp_find_task_in_descendant_queue(global_tid, queue);
887 
888  if (pt != NULL) {
889  int ct;
890 
891  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
892  // Make sure data structures in consistent state before querying them
893  // Seems to work without this for digital/alpha, needed for IBM/RS6000
894  KMP_MB();
895 
896  ct = --(queue->tq_ref_count);
897  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
898  global_tid, queue, ct));
899  KMP_DEBUG_ASSERT(ct >= 0);
900 
901  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
902 
903  return pt;
904  }
905 
906  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
907  // Make sure data structures in consistent state before querying them
908  // Seems to work without this for digital/alpha, needed for IBM/RS6000
909  KMP_MB();
910 
911  next = queue->tq_next_child;
912 
913  ct = --(queue->tq_ref_count);
914  KMP_DEBUG_REF_CTS(
915  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
916  KMP_DEBUG_ASSERT(ct >= 0);
917 
918  queue = next;
919  }
920 
921  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
922  }
923 
924  return pt;
925 }
926 
927 /* Walk up the taskq tree looking for a task to execute. If we get to the root,
928  search the tree for a descendant queue task. Must only be called when
929  in_parallel=TRUE */
930 static kmpc_thunk_t *
931 __kmp_find_task_in_ancestor_queue(kmp_taskq_t *tq, kmp_int32 global_tid,
932  kmpc_task_queue_t *curr_queue) {
933  kmpc_task_queue_t *queue;
934  kmpc_thunk_t *pt;
935 
936  pt = NULL;
937 
938  if (curr_queue->tq.tq_parent != NULL) {
939  queue = curr_queue->tq.tq_parent;
940 
941  while (queue != NULL) {
942  if (queue->tq.tq_parent != NULL) {
943  int ct;
944  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
945  // Make sure data structures in consistent state before querying them
946  // Seems to work without this for digital/alpha, needed for IBM/RS6000
947  KMP_MB();
948 
949  ct = ++(queue->tq_ref_count);
950  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
951  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n", __LINE__,
952  global_tid, queue, ct));
953  }
954 
955  pt = __kmp_find_task_in_queue(global_tid, queue);
956  if (pt != NULL) {
957  if (queue->tq.tq_parent != NULL) {
958  int ct;
959  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
960  // Make sure data structures in consistent state before querying them
961  // Seems to work without this for digital/alpha, needed for IBM/RS6000
962  KMP_MB();
963 
964  ct = --(queue->tq_ref_count);
965  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
966  global_tid, queue, ct));
967  KMP_DEBUG_ASSERT(ct >= 0);
968 
969  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
970  }
971 
972  return pt;
973  }
974 
975  if (queue->tq.tq_parent != NULL) {
976  int ct;
977  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
978  // Make sure data structures in consistent state before querying them
979  // Seems to work without this for digital/alpha, needed for IBM/RS6000
980  KMP_MB();
981 
982  ct = --(queue->tq_ref_count);
983  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
984  global_tid, queue, ct));
985  KMP_DEBUG_ASSERT(ct >= 0);
986  }
987  queue = queue->tq.tq_parent;
988 
989  if (queue != NULL)
990  __kmp_release_lock(&queue->tq_link_lck, global_tid);
991  }
992  }
993 
994  pt = __kmp_find_task_in_descendant_queue(global_tid, tq->tq_root);
995 
996  return pt;
997 }
998 
999 static int __kmp_taskq_tasks_finished(kmpc_task_queue_t *queue) {
1000  int i;
1001 
1002  /* KMP_MB(); */ /* is this really necessary? */
1003 
1004  for (i = 0; i < queue->tq_nproc; i++) {
1005  if (queue->tq_th_thunks[i].ai_data != 0)
1006  return FALSE;
1007  }
1008 
1009  return TRUE;
1010 }
1011 
1012 static int __kmp_taskq_has_any_children(kmpc_task_queue_t *queue) {
1013  return (queue->tq_first_child != NULL);
1014 }
1015 
1016 static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid,
1017  kmpc_task_queue_t *queue,
1018  int in_parallel) {
1019 #ifdef KMP_DEBUG
1020  kmp_int32 i;
1021  kmpc_thunk_t *thunk;
1022 #endif
1023 
1024  KF_TRACE(50,
1025  ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1026  KF_DUMP(50, __kmp_dump_task_queue(tq, queue, global_tid));
1027 
1028  /* sub-queue in a recursion, not the root task queue */
1029  KMP_DEBUG_ASSERT(queue->tq.tq_parent != NULL);
1030 
1031  if (in_parallel) {
1032  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1033  // Make sure data structures are in consistent state before querying them
1034  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1035  KMP_MB();
1036  }
1037 
1038  KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1039 
1040  /* unlink queue from its siblings if any at this level */
1041  if (queue->tq_prev_child != NULL)
1042  queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1043  if (queue->tq_next_child != NULL)
1044  queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1045  if (queue->tq.tq_parent->tq_first_child == queue)
1046  queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1047 
1048  queue->tq_prev_child = NULL;
1049  queue->tq_next_child = NULL;
1050 
1051  if (in_parallel) {
1052  KMP_DEBUG_REF_CTS(
1053  ("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1054  __LINE__, global_tid, queue, queue->tq_ref_count));
1055 
1056  /* wait until all other threads have stopped accessing this queue */
1057  while (queue->tq_ref_count > 1) {
1058  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1059 
1060  KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE,
1061  NULL);
1062 
1063  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1064  // Make sure data structures are in consistent state before querying them
1065  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1066  KMP_MB();
1067  }
1068 
1069  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1070  }
1071 
1072  KMP_DEBUG_REF_CTS(
1073  ("line %d gtid %d: Q %p freeing queue\n", __LINE__, global_tid, queue));
1074 
1075 #ifdef KMP_DEBUG
1076  KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1077  KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1078 
1079  for (i = 0; i < queue->tq_nproc; i++) {
1080  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1081  }
1082 
1083  i = 0;
1084  for (thunk = queue->tq_free_thunks; thunk != NULL;
1085  thunk = thunk->th.th_next_free)
1086  ++i;
1087 
1088  KMP_ASSERT(i ==
1089  queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1090 #endif
1091 
1092  /* release storage for queue entry */
1093  __kmp_free_taskq(tq, queue, TRUE, global_tid);
1094 
1095  KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1096  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1097 }
1098 
1099 /* Starting from indicated queue, proceed downward through tree and remove all
1100  taskqs which are finished, but only go down to taskqs which have the "nowait"
1101  clause present. Assume this is only called when in_parallel=TRUE. */
1102 
1103 static void __kmp_find_and_remove_finished_child_taskq(
1104  kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue) {
1105  kmpc_task_queue_t *queue = curr_queue;
1106 
1107  if (curr_queue->tq_first_child != NULL) {
1108  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1109  // Make sure data structures are in consistent state before querying them
1110  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1111  KMP_MB();
1112 
1113  queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
1114  if (queue == NULL) {
1115  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1116  return;
1117  }
1118 
1119  while (queue != NULL) {
1120  kmpc_task_queue_t *next;
1121  int ct = ++(queue->tq_ref_count);
1122  KMP_DEBUG_REF_CTS(
1123  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
1124 
1125  /* although reference count stays active during descendant walk, */
1126  /* shouldn't matter since if children still exist, reference */
1127  /* counts aren't being monitored anyway */
1128 
1129  if (queue->tq_flags & TQF_IS_NOWAIT) {
1130  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1131 
1132  if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) &&
1133  (queue->tq_nfull == 0) && __kmp_taskq_tasks_finished(queue) &&
1134  !__kmp_taskq_has_any_children(queue)) {
1135 
1136  /* Only remove this if we have not already marked it for deallocation.
1137  This should prevent multiple threads from trying to free this. */
1138 
1139  if (__kmp_test_lock(&queue->tq_queue_lck, global_tid)) {
1140  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1141  queue->tq_flags |= TQF_DEALLOCATED;
1142  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1143 
1144  __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1145 
1146  /* Can't do any more here since can't be sure where sibling queue
1147  * is so just exit this level */
1148  return;
1149  } else {
1150  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1151  }
1152  }
1153  /* otherwise, just fall through and decrement reference count */
1154  }
1155  }
1156 
1157  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1158  // Make sure data structures are in consistent state before querying them
1159  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1160  KMP_MB();
1161 
1162  next = queue->tq_next_child;
1163 
1164  ct = --(queue->tq_ref_count);
1165  KMP_DEBUG_REF_CTS(
1166  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1167  KMP_DEBUG_ASSERT(ct >= 0);
1168 
1169  queue = next;
1170  }
1171 
1172  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1173  }
1174 }
1175 
1176 /* Starting from indicated queue, proceed downward through tree and remove all
1177  taskqs, assuming all are finished and that NO other threads are executing
1178  at this point. */
1179 static void __kmp_remove_all_child_taskq(kmp_taskq_t *tq, kmp_int32 global_tid,
1180  kmpc_task_queue_t *queue) {
1181  kmpc_task_queue_t *next_child;
1182 
1183  queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child);
1184 
1185  while (queue != NULL) {
1186  __kmp_remove_all_child_taskq(tq, global_tid, queue);
1187 
1188  next_child = queue->tq_next_child;
1189  queue->tq_flags |= TQF_DEALLOCATED;
1190  __kmp_remove_queue_from_tree(tq, global_tid, queue, FALSE);
1191  queue = next_child;
1192  }
1193 }
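
#if 0 /* Illustrative sketch (editor's commentary), not compiled into the runtime. */
/* A minimal, hedged model of the bottom-up teardown performed by
   __kmp_remove_all_child_taskq above: recurse into each child's subtree first,
   remember the next sibling before the child is destroyed, then destroy the
   child.  The toy_* names are invented for this sketch. */
struct toy_tree_node {
  struct toy_tree_node *first_child;
  struct toy_tree_node *next_sibling;
};

static void toy_remove_subtree(struct toy_tree_node *node,
                               void (*destroy)(struct toy_tree_node *)) {
  struct toy_tree_node *child = node->first_child;

  while (child != NULL) {
    struct toy_tree_node *next;
    toy_remove_subtree(child, destroy); /* children of the child go first */
    next = child->next_sibling;         /* save before destroying the child */
    destroy(child);
    child = next;
  }
  node->first_child = NULL;
}
#endif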
1194 
1195 static void __kmp_execute_task_from_queue(kmp_taskq_t *tq, ident_t *loc,
1196  kmp_int32 global_tid,
1197  kmpc_thunk_t *thunk,
1198  int in_parallel) {
1199  kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1200  kmp_int32 tid = __kmp_tid_from_gtid(global_tid);
1201 
1202  KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1203  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1204  KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1205  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1206 
1207  /* For the taskq task, the curr_thunk push and pop pairs are set up as
1208  * follows:
1209  *
1210  * happens exactly once:
1211  * 1) __kmpc_taskq : push (if returning thunk only)
1212  * 4) __kmpc_end_taskq_task : pop
1213  *
1214  * optionally happens *each* time taskq task is dequeued/enqueued:
1215  * 2) __kmpc_taskq_task : pop
1216  * 3) __kmp_execute_task_from_queue : push
1217  *
1218  * execution ordering: 1,(2,3)*,4
1219  */
1220 
1221  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1222  kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1223  thunk->th.th_shareds =
1224  CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data);
1225 
1226  if (__kmp_env_consistency_check) {
1227  __kmp_push_workshare(global_tid,
1228  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1229  : ct_task,
1230  queue->tq_loc);
1231  }
1232  } else {
1233  if (__kmp_env_consistency_check)
1234  __kmp_push_workshare(global_tid, ct_taskq, queue->tq_loc);
1235  }
1236 
1237  if (in_parallel) {
1238  thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1239  tq->tq_curr_thunk[tid] = thunk;
1240 
1241  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1242  }
1243 
1244  KF_TRACE(50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk,
1245  queue, global_tid));
1246  thunk->th_task(global_tid, thunk);
1247  KF_TRACE(50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue,
1248  global_tid));
1249 
1250  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1251  if (__kmp_env_consistency_check)
1252  __kmp_pop_workshare(global_tid,
1253  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1254  : ct_task,
1255  queue->tq_loc);
1256 
1257  if (in_parallel) {
1258  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1259  thunk->th_encl_thunk = NULL;
1260  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1261  }
1262 
1263  if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1264  __kmp_taskq_check_ordered(global_tid, thunk);
1265  }
1266 
1267  __kmp_free_thunk(queue, thunk, in_parallel, global_tid);
1268 
1269  KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n",
1270  global_tid, thunk));
1271  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1272 
1273  if (in_parallel) {
1274  KMP_MB(); /* needed so thunk put on free list before outstanding thunk
1275  count is decremented */
1276 
1277  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1278 
1279  KF_TRACE(
1280  200,
1281  ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1282  global_tid, queue->tq_th_thunks[tid].ai_data - 1, queue));
1283 
1284  queue->tq_th_thunks[tid].ai_data--;
1285 
1286  /* KMP_MB(); */ /* is MB really necessary ? */
1287  }
1288 
1289  if (queue->tq.tq_parent != NULL && in_parallel) {
1290  int ct;
1291  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1292  ct = --(queue->tq_ref_count);
1293  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1294  KMP_DEBUG_REF_CTS(
1295  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1296  KMP_DEBUG_ASSERT(ct >= 0);
1297  }
1298  }
1299 }
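
#if 0 /* Illustrative sketch (editor's commentary), not compiled into the runtime. */
/* A minimal, hedged model of the per-thread "current thunk" stack maintained
   above: tq_curr_thunk[tid] points at the thunk a thread is executing, and
   each thunk records the one it interrupted in th_encl_thunk, forming a linked
   stack that is pushed before th_task() runs and popped afterwards (see also
   __kmp_dump_thunk_stack).  The toy_* names are invented for this sketch. */
struct toy_active_thunk {
  struct toy_active_thunk *encl; /* enclosing (interrupted) thunk */
};

static void toy_push_current(struct toy_active_thunk **curr,
                             struct toy_active_thunk *t) {
  t->encl = *curr; /* remember what this thread was running */
  *curr = t;
}

static void toy_pop_current(struct toy_active_thunk **curr) {
  struct toy_active_thunk *t = *curr;
  *curr = t->encl; /* restore the enclosing thunk */
  t->encl = NULL;
}
#endif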
1300 
1301 /* starts a taskq; creates and returns a thunk for the taskq_task */
1302 /* also, returns pointer to shared vars for this thread in "shareds" arg */
1303 kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid,
1304  kmpc_task_t taskq_task, size_t sizeof_thunk,
1305  size_t sizeof_shareds, kmp_int32 flags,
1306  kmpc_shared_vars_t **shareds) {
1307  int in_parallel;
1308  kmp_int32 nslots, nthunks, nshareds, nproc;
1309  kmpc_task_queue_t *new_queue, *curr_queue;
1310  kmpc_thunk_t *new_taskq_thunk;
1311  kmp_info_t *th;
1312  kmp_team_t *team;
1313  kmp_taskq_t *tq;
1314  kmp_int32 tid;
1315 
1316  KE_TRACE(10, ("__kmpc_taskq called (%d)\n", global_tid));
1317 
1318  th = __kmp_threads[global_tid];
1319  team = th->th.th_team;
1320  tq = &team->t.t_taskq;
1321  nproc = team->t.t_nproc;
1322  tid = __kmp_tid_from_gtid(global_tid);
1323 
1324  /* find out whether this is a parallel taskq or serialized one. */
1325  in_parallel = in_parallel_context(team);
1326 
1327  if (!tq->tq_root) {
1328  if (in_parallel) {
1329  /* Vector ORDERED SECTION to taskq version */
1330  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1331 
1332  /* Vector ORDERED SECTION to taskq version */
1333  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1334  }
1335 
1336  if (in_parallel) {
1337  // This shouldn't be a barrier region boundary; it will confuse the user.
1338  /* Need the boundary to be at the end of the taskq instead. */
1339  if (__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1340  /* Creating the active root queue, and we are not the master thread. */
1341  /* The master thread below created the queue and tasks have been */
1342  /* enqueued, and the master thread released this barrier. This */
1343  /* worker thread can now proceed and execute tasks. See also the */
1344  /* TQF_RELEASE_WORKERS which is used to handle this case. */
1345  *shareds =
1346  CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data);
1347  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1348 
1349  return NULL;
1350  }
1351  }
1352 
1353  /* master thread only executes this code */
1354  if (tq->tq_curr_thunk_capacity < nproc) {
1355  if (tq->tq_curr_thunk)
1356  __kmp_free(tq->tq_curr_thunk);
1357  else {
1358  /* only need to do this once at outer level, i.e. when tq_curr_thunk is
1359  * still NULL */
1360  __kmp_init_lock(&tq->tq_freelist_lck);
1361  }
1362 
1363  tq->tq_curr_thunk =
1364  (kmpc_thunk_t **)__kmp_allocate(nproc * sizeof(kmpc_thunk_t *));
1365  tq->tq_curr_thunk_capacity = nproc;
1366  }
1367 
1368  if (in_parallel)
1369  tq->tq_global_flags = TQF_RELEASE_WORKERS;
1370  }
1371 
1372  /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1373  /* on some heuristics (e.g., depth of queue nesting?). */
1374  nslots = (in_parallel) ? (2 * nproc) : 1;
1375 
1376  /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1377  /* jobs being executed by other threads, and one extra for taskq slot */
1378  nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1)
1379  : nslots + 2;
1380 
1381  /* Only the root taskq gets a per-thread array of shareds. */
1382  /* The rest of the taskq's only get one copy of the shared vars. */
1383  nshareds = (!tq->tq_root && in_parallel) ? nproc : 1;
1384 
1385  /* create overall queue data structure and its components that require
1386  * allocation */
1387  new_queue = __kmp_alloc_taskq(tq, in_parallel, nslots, nthunks, nshareds,
1388  nproc, sizeof_thunk, sizeof_shareds,
1389  &new_taskq_thunk, global_tid);
1390 
1391  /* rest of new_queue initializations */
1392  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1393 
1394  if (in_parallel) {
1395  new_queue->tq_tasknum_queuing = 0;
1396  new_queue->tq_tasknum_serving = 0;
1397  new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1398  }
1399 
1400  new_queue->tq_taskq_slot = NULL;
1401  new_queue->tq_nslots = nslots;
1402  new_queue->tq_hiwat = HIGH_WATER_MARK(nslots);
1403  new_queue->tq_nfull = 0;
1404  new_queue->tq_head = 0;
1405  new_queue->tq_tail = 0;
1406  new_queue->tq_loc = loc;
1407 
1408  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1409  /* prepare to serve the first-queued task's ORDERED directive */
1410  new_queue->tq_tasknum_serving = 1;
1411 
1412  /* Vector ORDERED SECTION to taskq version */
1413  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1414 
1415  /* Vector ORDERED SECTION to taskq version */
1416  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1417  }
1418 
1419  /* create a new thunk for the taskq_task in the new_queue */
1420  *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data);
1421 
1422  new_taskq_thunk->th.th_shareds = *shareds;
1423  new_taskq_thunk->th_task = taskq_task;
1424  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1425  new_taskq_thunk->th_status = 0;
1426 
1427  KMP_DEBUG_ASSERT(new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1428 
1429  // Make sure these inits complete before threads start using this queue
1430  /* KMP_MB(); */ // (necessary?)
1431 
1432  /* insert the new task queue into the tree, but only after all fields
1433  * initialized */
1434 
1435  if (in_parallel) {
1436  if (!tq->tq_root) {
1437  new_queue->tq.tq_parent = NULL;
1438  new_queue->tq_first_child = NULL;
1439  new_queue->tq_next_child = NULL;
1440  new_queue->tq_prev_child = NULL;
1441  new_queue->tq_ref_count = 1;
1442  tq->tq_root = new_queue;
1443  } else {
1444  curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1445  new_queue->tq.tq_parent = curr_queue;
1446  new_queue->tq_first_child = NULL;
1447  new_queue->tq_prev_child = NULL;
1448  new_queue->tq_ref_count =
1449  1; /* for the thread that built the queue */
1450 
1451  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n", __LINE__,
1452  global_tid, new_queue, new_queue->tq_ref_count));
1453 
1454  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1455 
1456  // Make sure data structures are in consistent state before querying them
1457  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1458  KMP_MB();
1459 
1460  new_queue->tq_next_child =
1461  CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child);
1462 
1463  if (curr_queue->tq_first_child != NULL)
1464  curr_queue->tq_first_child->tq_prev_child = new_queue;
1465 
1466  curr_queue->tq_first_child = new_queue;
1467 
1468  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1469  }
1470 
1471  /* set up thunk stack only after code that determines curr_queue above */
1472  new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1473  tq->tq_curr_thunk[tid] = new_taskq_thunk;
1474 
1475  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1476  } else {
1477  new_taskq_thunk->th_encl_thunk = 0;
1478  new_queue->tq.tq_parent = NULL;
1479  new_queue->tq_first_child = NULL;
1480  new_queue->tq_next_child = NULL;
1481  new_queue->tq_prev_child = NULL;
1482  new_queue->tq_ref_count = 1;
1483  }
1484 
1485 #ifdef KMP_DEBUG
1486  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1487  KF_DUMP(150, __kmp_dump_thunk(tq, new_taskq_thunk, global_tid));
1488 
1489  if (in_parallel) {
1490  KF_TRACE(25,
1491  ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1492  } else {
1493  KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue,
1494  global_tid));
1495  }
1496 
1497  KF_DUMP(25, __kmp_dump_task_queue(tq, new_queue, global_tid));
1498 
1499  if (in_parallel) {
1500  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1501  }
1502 #endif /* KMP_DEBUG */
1503 
1504  if (__kmp_env_consistency_check)
1505  __kmp_push_workshare(global_tid, ct_taskq, new_queue->tq_loc);
1506 
1507  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1508 
1509  return new_taskq_thunk;
1510 }
1511 
1512 /* ends a taskq; last thread out destroys the queue */
1513 
1514 void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
1515  kmpc_thunk_t *taskq_thunk) {
1516 #ifdef KMP_DEBUG
1517  kmp_int32 i;
1518 #endif
1519  kmp_taskq_t *tq;
1520  int in_parallel;
1521  kmp_info_t *th;
1522  kmp_int32 is_outermost;
1523  kmpc_task_queue_t *queue;
1524  kmpc_thunk_t *thunk;
1525  int nproc;
1526 
1527  KE_TRACE(10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1528 
1529  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1530  nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
1531 
1532  /* For the outermost taskq only, all but one thread will have taskq_thunk ==
1533  * NULL */
1534  queue = (taskq_thunk == NULL) ? tq->tq_root
1535  : taskq_thunk->th.th_shareds->sv_queue;
1536 
1537  KE_TRACE(50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1538  is_outermost = (queue == tq->tq_root);
1539  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1540 
1541  if (in_parallel) {
1542  kmp_uint32 spins;
1543 
1544  /* this is just a safeguard to release the waiting threads if */
1545  /* the outermost taskq never queues a task */
1546 
1547  if (is_outermost && (KMP_MASTER_GTID(global_tid))) {
1548  if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
1549  /* no lock needed, workers are still in spin mode */
1550  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1551 
1552  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1553  }
1554  }
1555 
1556  /* keep dequeueing work until all tasks are queued and dequeued */
1557 
1558  do {
1559  /* wait until something is available to dequeue */
1560  KMP_INIT_YIELD(spins);
1561 
1562  while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
1563  (!__kmp_taskq_has_any_children(queue)) &&
1564  (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
1565  KMP_YIELD_WHEN(TRUE, spins);
1566  }
1567 
1568  /* check to see if we can execute tasks in the queue */
1569  while (((queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL)) &&
1570  (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL) {
1571  KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk,
1572  queue, global_tid));
1573  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1574  }
1575 
1576  /* see if work can be found in a descendant queue */
1577  if ((__kmp_taskq_has_any_children(queue)) &&
1578  (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1579  NULL) {
1580 
1581  KF_TRACE(50,
1582  ("Stole thunk: %p in descendant queue: %p while waiting in "
1583  "queue: %p (%d)\n",
1584  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1585 
1586  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1587  }
1588 
1589  } while ((!(queue->tq_flags & TQF_ALL_TASKS_QUEUED)) ||
1590  (queue->tq_nfull != 0));
1591 
1592  KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue,
1593  global_tid));
1594 
1595  /* while not all tasks are finished, keep executing any work found
1596  in descendant queues */
1597 
1598  while ((!__kmp_taskq_tasks_finished(queue)) &&
1599  (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1600  NULL) {
1601 
1602  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in "
1603  "queue: %p (%d)\n",
1604  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1605 
1606  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1607  }
1608 
1609  KF_TRACE(50, ("No work found in descendant queues or all work finished in "
1610  "queue: %p (%d)\n",
1611  queue, global_tid));
1612 
1613  if (!is_outermost) {
1614  /* need to return if NOWAIT present and not outermost taskq */
1615 
1616  if (queue->tq_flags & TQF_IS_NOWAIT) {
1617  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1618  queue->tq_ref_count--;
1619  KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
1620  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1621 
1622  KE_TRACE(
1623  10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1624 
1625  return;
1626  }
1627 
1628  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1629 
1630  /* WAIT until all tasks are finished and no child queues exist before
1631  * proceeding */
1632  KMP_INIT_YIELD(spins);
1633 
1634  while (!__kmp_taskq_tasks_finished(queue) ||
1635  __kmp_taskq_has_any_children(queue)) {
1636  thunk = __kmp_find_task_in_ancestor_queue(tq, global_tid, queue);
1637 
1638  if (thunk != NULL) {
1639  KF_TRACE(50,
1640  ("Stole thunk: %p in ancestor queue: %p while waiting in "
1641  "queue: %p (%d)\n",
1642  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1643  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk,
1644  in_parallel);
1645  }
1646 
1647  KMP_YIELD_WHEN(thunk == NULL, spins);
1648 
1649  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1650  }
1651 
1652  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1653  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1654  queue->tq_flags |= TQF_DEALLOCATED;
1655  }
1656  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1657 
1658  /* only the allocating thread can deallocate the queue */
1659  if (taskq_thunk != NULL) {
1660  __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1661  }
1662 
1663  KE_TRACE(
1664  10,
1665  ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n",
1666  global_tid));
1667 
1668  return;
1669  }
1670 
1671  // Outermost Queue: steal work from descendants until all tasks are finished
1672 
1673  KMP_INIT_YIELD(spins);
1674 
1675  while (!__kmp_taskq_tasks_finished(queue)) {
1676  thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1677 
1678  if (thunk != NULL) {
1679  KF_TRACE(50,
1680  ("Stole thunk: %p in descendant queue: %p while waiting in "
1681  "queue: %p (%d)\n",
1682  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1683 
1684  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1685  }
1686 
1687  KMP_YIELD_WHEN(thunk == NULL, spins);
1688  }
1689 
1690  /* Need this barrier to prevent destruction of the queue before all threads
1691  * have executed the code above */
1692  /* This may need to be done earlier when NOWAIT is implemented for the
1693  * outermost level */
1694 
1695  if (!__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1696  /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1697  /* for right now, everybody waits, and the master thread destroys the */
1698  /* remaining queues. */
1699 
1700  __kmp_remove_all_child_taskq(tq, global_tid, queue);
1701 
1702  /* Now destroy the root queue */
1703  KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n",
1704  global_tid, queue));
1705  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1706 
1707 #ifdef KMP_DEBUG
1708  /* the root queue entry */
1709  KMP_DEBUG_ASSERT((queue->tq.tq_parent == NULL) &&
1710  (queue->tq_next_child == NULL));
1711 
1712  /* children must all be gone by now because of barrier above */
1713  KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1714 
1715  for (i = 0; i < nproc; i++) {
1716  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1717  }
1718 
1719  for (i = 0, thunk = queue->tq_free_thunks; thunk != NULL;
1720  i++, thunk = thunk->th.th_next_free)
1721  ;
1722 
1723  KMP_DEBUG_ASSERT(i ==
1724  queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1725 
1726  for (i = 0; i < nproc; i++) {
1727  KMP_DEBUG_ASSERT(!tq->tq_curr_thunk[i]);
1728  }
1729 #endif
1730  /* unlink the root queue entry */
1731  tq->tq_root = NULL;
1732 
1733  /* release storage for root queue entry */
1734  KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue,
1735  global_tid));
1736 
1737  queue->tq_flags |= TQF_DEALLOCATED;
1738  __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1739 
1740  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1741 
1742  /* release the workers now that the data structures are up to date */
1743  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1744  }
1745 
1746  th = __kmp_threads[global_tid];
1747 
1748  /* Reset ORDERED SECTION entry function to the parallel version */
1749  th->th.th_dispatch->th_deo_fcn = 0;
1750 
1751  /* Reset ORDERED SECTION exit function to the parallel version */
1752  th->th.th_dispatch->th_dxo_fcn = 0;
1753  } else {
1754  /* in a serial execution context, dequeue the last task */
1755  /* and execute it, if any tasks were encountered */
1756 
1757  if (queue->tq_nfull > 0) {
1758  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1759 
1760  thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1761 
1762  if (queue->tq_flags & TQF_IS_LAST_TASK) {
1763  /* TQF_IS_LASTPRIVATE is set and only one task is left in the queue; */
1764  /* __kmpc_end_taskq_task() has been run, so this is the last task. Run */
1765  /* it with TQF_IS_LAST_TASK so the instrumentation does the copy-out. */
1766 
1767  /* no need for test_then_or call since already locked */
1768  thunk->th_flags |= TQF_IS_LAST_TASK;
1769  }
1770 
1771  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1772  thunk, queue));
1773 
1774  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1775  }
1776 
1777  // destroy the unattached serial queue now that there is no more work to do
1778  KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n",
1779  queue, global_tid));
1780  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1781 
1782 #ifdef KMP_DEBUG
1783  i = 0;
1784  for (thunk = queue->tq_free_thunks; thunk != NULL;
1785  thunk = thunk->th.th_next_free)
1786  ++i;
1787  KMP_DEBUG_ASSERT(i == queue->tq_nslots + 1);
1788 #endif
1789  /* release storage for unattached serial queue */
1790  KF_TRACE(50,
1791  ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1792 
1793  queue->tq_flags |= TQF_DEALLOCATED;
1794  __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1795  }
1796 
1797  KE_TRACE(10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1798 }
1799 
1800 /* Enqueues a task for thunk previously created by __kmpc_task_buffer. */
1801 /* Returns nonzero if the queue was just filled up. */
1802 
1803 kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk) {
1804  kmp_int32 ret;
1805  kmpc_task_queue_t *queue;
1806  int in_parallel;
1807  kmp_taskq_t *tq;
1808 
1809  KE_TRACE(10, ("__kmpc_task called (%d)\n", global_tid));
1810 
1811  KMP_DEBUG_ASSERT(!(thunk->th_flags &
1812  TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1813 
1814  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1815  queue = thunk->th.th_shareds->sv_queue;
1816  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1817 
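  /* For ordered tasks in a parallel context, hand out a monotonically
     increasing ticket number at enqueue time; the ordered-section support
     later serves these tickets in the same order. */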
1818  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1819  thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1820 
1821  /* For serial execution dequeue the preceding task and execute it, if one
1822  * exists */
1823  /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1824 
1825  if (!in_parallel && queue->tq_nfull > 0) {
1826  kmpc_thunk_t *prev_thunk;
1827 
1828  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1829 
1830  prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1831 
1832  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1833  prev_thunk, queue));
1834 
1835  __kmp_execute_task_from_queue(tq, loc, global_tid, prev_thunk, in_parallel);
1836  }
1837 
1838  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
1839  variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
1840  task queue is not full and allocates a thunk (which is then passed to
1841  __kmpc_task()). So, the enqueue below should never fail due to a full
1842  queue. */
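  /* A minimal sketch of that sequence as it might appear in compiler-generated
     code; `gtid`, `task_routine`, and the private-variable initialization step
     are hypothetical placeholders, and `taskq_thunk` stands for the enclosing
     taskq's thunk:

       kmpc_thunk_t *t = __kmpc_task_buffer(loc, gtid, taskq_thunk, task_routine);
       // ... initialize this task's private variables through t ...
       (void)__kmpc_task(loc, gtid, t);  // enqueue; the queue cannot be full here
  */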
1843 
1844  KF_TRACE(100, ("Before enqueueing this Task on (%d):\n", global_tid));
1845  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1846 
1847  ret = __kmp_enqueue_task(tq, global_tid, queue, thunk, in_parallel);
1848 
1849  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1850  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1851 
1852  KE_TRACE(10, ("__kmpc_task return (%d)\n", global_tid));
1853 
1854  return ret;
1855 }
1856 
1857 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */
1858 /* this should never be called unless in a parallel context */
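/* Illustrative call only, assuming `taskq_thunk` is the thunk previously
   returned by __kmpc_taskq for this queue and `status` is whatever value the
   compiler/runtime contract defines (it is simply recorded in th_status below):

     __kmpc_taskq_task(loc, gtid, taskq_thunk, status);
*/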
1859 
1860 void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk,
1861  kmp_int32 status) {
1862  kmpc_task_queue_t *queue;
1863  kmp_taskq_t *tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1864  int tid = __kmp_tid_from_gtid(global_tid);
1865 
1866  KE_TRACE(10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1867  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1868  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1869 
1870  queue = thunk->th.th_shareds->sv_queue;
1871 
1872  if (__kmp_env_consistency_check)
1873  __kmp_pop_workshare(global_tid, ct_taskq, loc);
1874 
1875  /* thunk->th_task is the taskq_task */
1876  KMP_DEBUG_ASSERT(thunk->th_flags & TQF_TASKQ_TASK);
1877 
1878  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1879  KMP_DEBUG_ASSERT(queue->tq_taskq_slot == NULL);
1880 
1881  /* dequeue taskq thunk from curr_thunk stack */
1882  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1883  thunk->th_encl_thunk = NULL;
1884 
1885  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1886 
1887  thunk->th_status = status;
1888 
1889  // Flush thunk->th_status before the taskq_task is enqueued to avoid a race
1890  KMP_MB();
1891 
1892  /* enqueue the taskq_task thunk into the queue's special slot */
1893  /* GEH - probably don't need to lock the taskq slot since only one */
1894  /* thread enqueues, and a lock is already held at the dequeue point */
1895 
1896  queue->tq_taskq_slot = thunk;
1897 
1898  KE_TRACE(10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1899 }
1900 
1901 /* ends a taskq_task; done generating tasks */
1902 
1903 void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
1904  kmpc_thunk_t *thunk) {
1905  kmp_taskq_t *tq;
1906  kmpc_task_queue_t *queue;
1907  int in_parallel;
1908  int tid;
1909 
1910  KE_TRACE(10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1911 
1912  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1913  queue = thunk->th.th_shareds->sv_queue;
1914  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1915  tid = __kmp_tid_from_gtid(global_tid);
1916 
1917  if (__kmp_env_consistency_check)
1918  __kmp_pop_workshare(global_tid, ct_taskq, loc);
1919 
1920  if (in_parallel) {
1921 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1922  KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1923  TQF_ALL_TASKS_QUEUED);
1924 #else
1925  {
1926  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1927 
1928  // Make sure data structures are in consistent state before querying them
1929  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1930  KMP_MB();
1931 
1932  queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1933  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1934  }
1935 #endif
1936  }
1937 
1938  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1939  /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in
1940  the queue when TQF_IS_LASTPRIVATE is set, so that we can positively identify
1941  that last task and run it with its TQF_IS_LAST_TASK bit turned on in th_flags.
1942  When __kmpc_end_taskq_task() is called we are done generating all the
1943  tasks, so we know the last one in the queue is the lastprivate task.
1944  Record that the queue has reached this state by setting TQF_IS_LAST_TASK in
1945  tq_flags; when that task actually executes, mark it by setting
1946  TQF_IS_LAST_TASK in th_flags (this th_flags bit signals the instrumented
1947  code to do copy-outs after execution). */
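    /* Purely as an illustration, the instrumented task body is expected to end
       with a check of this shape (hypothetical; the real generated code may
       differ):

         if (thunk->th_flags & TQF_IS_LAST_TASK) {
           // perform the lastprivate copy-out into the enclosing context
         }
    */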
1948  if (!in_parallel) {
1949  /* No synchronization needed for serial context */
1950  queue->tq_flags |= TQF_IS_LAST_TASK;
1951  } else {
1952 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1953  KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1954  TQF_IS_LAST_TASK);
1955 #else
1956  {
1957  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1958 
1959  // Make sure data structures are in consistent state before querying them
1960  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1961  KMP_MB();
1962 
1963  queue->tq_flags |= TQF_IS_LAST_TASK;
1964  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1965  }
1966 #endif
1967  /* to prevent a race condition where the last task is dequeued but the */
1968  /* flag isn't visible yet (not sure about this) */
1969  KMP_MB();
1970  }
1971  }
1972 
1973  /* dequeue taskq thunk from curr_thunk stack */
1974  if (in_parallel) {
1975  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1976  thunk->th_encl_thunk = NULL;
1977 
1978  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1979  }
1980 
1981  KE_TRACE(10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
1982 }
1983 
1984 /* returns thunk for a regular task based on taskq_thunk */
1985 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
1986 
1987 kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
1988  kmpc_thunk_t *taskq_thunk, kmpc_task_t task) {
1989  kmp_taskq_t *tq;
1990  kmpc_task_queue_t *queue;
1991  kmpc_thunk_t *new_thunk;
1992  int in_parallel;
1993 
1994  KE_TRACE(10, ("__kmpc_task_buffer called (%d)\n", global_tid));
1995 
1996  KMP_DEBUG_ASSERT(
1997  taskq_thunk->th_flags &
1998  TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
1999 
2000  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
2001  queue = taskq_thunk->th.th_shareds->sv_queue;
2002  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2003 
2004  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
2005  variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
2006  task queue is not full and allocates a thunk (which is then passed to
2007  __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be
2008  the next to be enqueued in __kmpc_task(). */
2009 
2010  new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid);
2011  new_thunk->th.th_shareds =
2012  CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data);
2013  new_thunk->th_encl_thunk = NULL;
2014  new_thunk->th_task = task;
2015 
2016  /* GEH - shouldn't need to lock the read of tq_flags here */
2017  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2018 
2019  new_thunk->th_status = 0;
2020 
2021  KMP_DEBUG_ASSERT(!(new_thunk->th_flags & TQF_TASKQ_TASK));
2022 
2023  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2024  KF_DUMP(100, __kmp_dump_thunk(tq, new_thunk, global_tid));
2025 
2026  KE_TRACE(10, ("__kmpc_task_buffer return (%d)\n", global_tid));
2027 
2028  return new_thunk;
2029 }
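/* A rough, non-authoritative sketch of how these entry points fit together for
   a single taskq construct; argument lists are simplified and `task_routine`
   plus the private-initialization steps are hypothetical placeholders:

     taskq_thunk = __kmpc_taskq(loc, gtid, ...);       // create queue and thunk
     // while generating tasks (inside the taskq body):
     //   t = __kmpc_task_buffer(loc, gtid, taskq_thunk, task_routine);
     //   ... initialize t's private data ...
     //   __kmpc_task(loc, gtid, t);
     // once all tasks have been generated:
     //   __kmpc_end_taskq_task(loc, gtid, taskq_thunk);
     // finally, each participating thread calls (taskq_thunk may be NULL on
     // threads that did not create the queue):
     __kmpc_end_taskq(loc, gtid, taskq_thunk);
*/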