LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 #include "kmp_dispatch.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 
36 /* these are temporary issues to be dealt with */
37 #define KMP_USE_PRCTL 0
38 
39 #if KMP_OS_WINDOWS
40 #include <process.h>
41 #endif
42 
43 #include "tsan_annotations.h"
44 
45 #if defined(KMP_GOMP_COMPAT)
46 char const __kmp_version_alt_comp[] =
47  KMP_VERSION_PREFIX "alternative compiler support: yes";
48 #endif /* defined(KMP_GOMP_COMPAT) */
49 
50 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
51 #if OMP_50_ENABLED
52  "5.0 (201611)";
53 #elif OMP_45_ENABLED
54  "4.5 (201511)";
55 #elif OMP_40_ENABLED
56  "4.0 (201307)";
57 #else
58  "3.1 (201107)";
59 #endif
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63  KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79  int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81  kmp_internal_control_t *new_icvs,
82  ident_t *loc);
83 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85  int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91  kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* A fast (and somewhat portable) way to get a unique identifier for the
107  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109  int i;
110  kmp_info_t **other_threads;
111  size_t stack_data;
112  char *stack_addr;
113  size_t stack_size;
114  char *stack_base;
115 
116  KA_TRACE(
117  1000,
118  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
119  __kmp_nth, __kmp_all_nth));
120 
121  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
122  to a parallel region, this was made to return KMP_GTID_DNE to force
123  serial_initialize by the caller. We had to handle KMP_GTID_DNE at all call
124  sites, or else guarantee __kmp_init_gtid, for this to work. */
125 
126  if (!TCR_4(__kmp_init_gtid))
127  return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130  if (TCR_4(__kmp_gtid_mode) >= 3) {
131  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132  return __kmp_gtid;
133  }
134 #endif
135  if (TCR_4(__kmp_gtid_mode) >= 2) {
136  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137  return __kmp_gtid_get_specific();
138  }
139  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141  stack_addr = (char *)&stack_data;
142  other_threads = __kmp_threads;
143 
144  /* ATT: The code below is a source of potential bugs due to unsynchronized
145  access to __kmp_threads array. For example:
146  1. Current thread loads other_threads[i] to thr and checks it, it is
147  non-NULL.
148  2. Current thread is suspended by OS.
149  3. Another thread unregisters and finishes (debug versions of free()
150  may fill memory with something like 0xEF).
151  4. Current thread is resumed.
152  5. Current thread reads junk from *thr.
153  TODO: Fix it. --ln */
154 
155  for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158  if (!thr)
159  continue;
160 
161  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164  /* stack grows down -- search through all of the active threads */
165 
166  if (stack_addr <= stack_base) {
167  size_t stack_diff = stack_base - stack_addr;
168 
169  if (stack_diff <= stack_size) {
170  /* The only way we can be closer than the allocated */
171  /* stack size is if we are running on this thread. */
172  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173  return i;
174  }
175  }
176  }
177 
178  /* get specific to try and determine our gtid */
179  KA_TRACE(1000,
180  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181  "thread, using TLS\n"));
182  i = __kmp_gtid_get_specific();
183 
184  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
185 
186  /* if we haven't been assigned a gtid, then return that code */
187  if (i < 0)
188  return i;
189 
190  /* dynamically updated stack window for uber threads to avoid get_specific
191  call */
192  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193  KMP_FATAL(StackOverflow, i);
194  }
195 
196  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197  if (stack_addr > stack_base) {
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201  stack_base);
202  } else {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  stack_base - stack_addr);
205  }
206 
207  /* Reprint stack bounds for ubermaster since they have been refined */
208  if (__kmp_storage_map) {
209  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212  other_threads[i]->th.th_info.ds.ds_stacksize,
213  "th_%d stack (refinement)", i);
214  }
215  return i;
216 }
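/* Illustrative sketch (not part of the runtime): the lookup above identifies
   the calling thread by testing whether the address of a local variable falls
   inside a registered stack window [stack_base - stack_size, stack_base],
   since stacks grow down. The names below (stack_window_t,
   find_thread_by_stack, my_thread_index) are hypothetical and exist only for
   this example. */
#include <stddef.h>

typedef struct {
  char *stack_base;  /* highest address of the thread's stack */
  size_t stack_size; /* bytes reserved below stack_base */
} stack_window_t;

/* Return the index of the window containing addr, or -1 if none matches. */
static int find_thread_by_stack(const stack_window_t *win, int count,
                                const char *addr) {
  for (int i = 0; i < count; ++i) {
    if (addr <= win[i].stack_base &&
        (size_t)(win[i].stack_base - addr) <= win[i].stack_size)
      return i;
  }
  return -1;
}

/* Typical use: probe with the address of any local variable, which is
   guaranteed to live on the calling thread's stack. */
static int my_thread_index(const stack_window_t *win, int count) {
  char probe;
  return find_thread_by_stack(win, count, &probe);
}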
217 
218 int __kmp_get_global_thread_id_reg() {
219  int gtid;
220 
221  if (!__kmp_init_serial) {
222  gtid = KMP_GTID_DNE;
223  } else
224 #ifdef KMP_TDATA_GTID
225  if (TCR_4(__kmp_gtid_mode) >= 3) {
226  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227  gtid = __kmp_gtid;
228  } else
229 #endif
230  if (TCR_4(__kmp_gtid_mode) >= 2) {
231  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232  gtid = __kmp_gtid_get_specific();
233  } else {
234  KA_TRACE(1000,
235  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236  gtid = __kmp_get_global_thread_id();
237  }
238 
239  /* we must be a new uber master sibling thread */
240  if (gtid == KMP_GTID_DNE) {
241  KA_TRACE(10,
242  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243  "Registering a new gtid.\n"));
244  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245  if (!__kmp_init_serial) {
246  __kmp_do_serial_initialize();
247  gtid = __kmp_gtid_get_specific();
248  } else {
249  gtid = __kmp_register_root(FALSE);
250  }
251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253  }
254 
255  KMP_DEBUG_ASSERT(gtid >= 0);
256 
257  return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262  int f;
263  char *stack_beg = NULL;
264  char *stack_end = NULL;
265  int gtid;
266 
267  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268  if (__kmp_storage_map) {
269  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272  gtid = __kmp_gtid_from_thread(th);
273 
274  if (gtid == KMP_GTID_MONITOR) {
275  __kmp_print_storage_map_gtid(
276  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277  "th_%s stack (%s)", "mon",
278  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279  } else {
280  __kmp_print_storage_map_gtid(
281  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282  "th_%d stack (%s)", gtid,
283  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284  }
285  }
286 
287  /* No point in checking ubermaster threads since they use refinement and
288  * cannot overlap */
289  gtid = __kmp_gtid_from_thread(th);
290  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291  KA_TRACE(10,
292  ("__kmp_check_stack_overlap: performing extensive checking\n"));
293  if (stack_beg == NULL) {
294  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296  }
297 
298  for (f = 0; f < __kmp_threads_capacity; f++) {
299  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301  if (f_th && f_th != th) {
302  char *other_stack_end =
303  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304  char *other_stack_beg =
305  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309  /* Print the other stack values before the abort */
310  if (__kmp_storage_map)
311  __kmp_print_storage_map_gtid(
312  -1, other_stack_beg, other_stack_end,
313  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317  __kmp_msg_null);
318  }
319  }
320  }
321  }
322  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
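/* Illustrative sketch (not part of the runtime): the check above reports an
   error when two threads' stack ranges intersect. The canonical test for two
   half-open ranges is shown below; the runtime's version uses an
   endpoint-containment variant of the same idea. The name ranges_overlap is
   hypothetical. */
static int ranges_overlap(const char *beg_a, const char *end_a,
                          const char *beg_b, const char *end_b) {
  /* [beg_a, end_a) and [beg_b, end_b) overlap iff each begins before the
     other ends. */
  return beg_a < end_b && beg_b < end_a;
}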
324 
325 /* ------------------------------------------------------------------------ */
326 
327 void __kmp_infinite_loop(void) {
328  static int done = FALSE;
329 
330  while (!done) {
331  KMP_YIELD(1);
332  }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338  char const *format, ...) {
339  char buffer[MAX_MESSAGE];
340  va_list ap;
341 
342  va_start(ap, format);
343  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344  p2, (unsigned long)size, format);
345  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346  __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348  int node;
349  if (gtid >= 0) {
350  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351  if (__kmp_storage_map_verbose) {
352  node = __kmp_get_host_node(p1);
353  if (node < 0) /* doesn't work, so don't try this next time */
354  __kmp_storage_map_verbose = FALSE;
355  else {
356  char *last;
357  int lastNode;
358  int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360  const int page_size = KMP_GET_PAGE_SIZE();
361 
362  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364  if (localProc >= 0)
365  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
366  localProc >> 1);
367  else
368  __kmp_printf_no_lock(" GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370  /* The more elaborate format is disabled for now because of the prctl
371  * hanging bug. */
372  do {
373  last = p1;
374  lastNode = node;
375  /* This loop collates adjacent pages with the same host node. */
376  do {
377  p1 = (char *)p1 + page_size;
378  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
380  lastNode);
381  } while (p1 <= p2);
382 #else
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
384  (char *)p1 + (page_size - 1),
385  __kmp_get_host_node(p1));
386  if (p1 < p2) {
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
388  (char *)p2 + (page_size - 1),
389  __kmp_get_host_node(p2));
390  }
391 #endif
392  }
393  }
394  } else
395  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
396  }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
400 
401 void __kmp_warn(char const *format, ...) {
402  char buffer[MAX_MESSAGE];
403  va_list ap;
404 
405  if (__kmp_generate_warnings == kmp_warnings_off) {
406  return;
407  }
408 
409  va_start(ap, format);
410 
411  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413  __kmp_vprintf(kmp_err, buffer, ap);
414  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416  va_end(ap);
417 }
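/* Illustrative sketch (not part of the runtime): __kmp_warn above embeds the
   caller's format string into a prefixed template with KMP_SNPRINTF and then
   expands the caller's variadic arguments against that template via
   __kmp_vprintf. The same pattern using only the standard C library looks
   like this; the name warn_prefixed is hypothetical. */
#include <stdarg.h>
#include <stdio.h>

static void warn_prefixed(const char *format, ...) {
  char buffer[512];
  va_list ap;

  /* Step 1: splice the caller's format string into a prefixed template. */
  snprintf(buffer, sizeof(buffer), "example warning: %s\n", format);

  /* Step 2: expand the caller's arguments against the combined template. */
  va_start(ap, format);
  vfprintf(stderr, buffer, ap);
  va_end(ap);
}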
418 
419 void __kmp_abort_process() {
420  // Later threads may stall here, but that's ok because abort() will kill them.
421  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423  if (__kmp_debug_buf) {
424  __kmp_dump_debug_buffer();
425  }
426 
427  if (KMP_OS_WINDOWS) {
428  // Let other threads know of abnormal termination and prevent deadlock
429  // if abort happened during library initialization or shutdown
430  __kmp_global.g.g_abort = SIGABRT;
431 
432  /* On Windows* OS, abort() by default raises a pop-up error box, which stalls
433  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
434  boxes. _set_abort_behavior() works well, but this function is not
435  available in VS7 (this is not a problem for the DLL, but it is a problem
436  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
437  does not help, at least in some versions of the MS C RTL.
438 
439  It seems the following sequence is the only way to simulate abort() and
440  avoid the pop-up error box. */
441  raise(SIGABRT);
442  _exit(3); // Just in case, if signal ignored, exit anyway.
443  } else {
444  abort();
445  }
446 
447  __kmp_infinite_loop();
448  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453  // TODO: Eliminate g_abort global variable and this function.
454  // In case of abort just call abort(), it will kill all the threads.
455  __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459  that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463  gtid);
464 
465  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469  sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471  __kmp_print_storage_map_gtid(
472  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476  &thr->th.th_bar[bs_plain_barrier + 1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478  gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481  &thr->th.th_bar[bs_forkjoin_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483  gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487  &thr->th.th_bar[bs_reduction_barrier + 1],
488  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489  gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494  that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497  int team_id, int num_thr) {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500  header, team_id);
501 
502  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503  &team->t.t_bar[bs_last_barrier],
504  sizeof(kmp_balign_team_t) * bs_last_barrier,
505  "%s_%d.t_bar", header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508  &team->t.t_bar[bs_plain_barrier + 1],
509  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513  &team->t.t_bar[bs_forkjoin_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519  &team->t.t_bar[bs_reduction_barrier + 1],
520  sizeof(kmp_balign_team_t),
521  "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524  __kmp_print_storage_map_gtid(
525  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528  __kmp_print_storage_map_gtid(
529  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533  &team->t.t_disp_buffer[num_disp_buff],
534  sizeof(dispatch_shared_info_t) * num_disp_buff,
535  "%s_%d.t_disp_buffer", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
538  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
539  team_id);
540 }
541 
542 static void __kmp_init_allocator() {
543 #if OMP_50_ENABLED
544  __kmp_init_memkind();
545 #endif
546 }
547 static void __kmp_fini_allocator() {
548 #if OMP_50_ENABLED
549  __kmp_fini_memkind();
550 #endif
551 }
552 
553 /* ------------------------------------------------------------------------ */
554 
555 #if KMP_DYNAMIC_LIB
556 #if KMP_OS_WINDOWS
557 
558 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
559  // TODO: Change to __kmp_break_bootstrap_lock().
560  __kmp_init_bootstrap_lock(lck); // make the lock released
561 }
562 
563 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
564  int i;
565  int thread_count;
566 
567  // PROCESS_DETACH is expected to be called by a thread that executes
568  // ProcessExit() or FreeLibrary(). The OS terminates other threads (except the
569  // one calling ProcessExit or FreeLibrary), so it might be safe to access
570  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
571  // threads can still be alive here, though they are about to be terminated. The
572  // threads in the array with ds_thread==0 are the most suspicious. Actually, it
573  // may not be safe to access __kmp_threads[].
574 
575  // TODO: does it make sense to check __kmp_roots[] ?
576 
577  // Let's check that there are no other alive threads registered with the OMP
578  // lib.
579  while (1) {
580  thread_count = 0;
581  for (i = 0; i < __kmp_threads_capacity; ++i) {
582  if (!__kmp_threads)
583  continue;
584  kmp_info_t *th = __kmp_threads[i];
585  if (th == NULL)
586  continue;
587  int gtid = th->th.th_info.ds.ds_gtid;
588  if (gtid == gtid_req)
589  continue;
590  if (gtid < 0)
591  continue;
592  DWORD exit_val;
593  int alive = __kmp_is_thread_alive(th, &exit_val);
594  if (alive) {
595  ++thread_count;
596  }
597  }
598  if (thread_count == 0)
599  break; // success
600  }
601 
602  // Assume that I'm alone. Now it might be safe to check and reset locks.
603  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
604  __kmp_reset_lock(&__kmp_forkjoin_lock);
605 #ifdef KMP_DEBUG
606  __kmp_reset_lock(&__kmp_stdio_lock);
607 #endif // KMP_DEBUG
608 }
609 
610 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
611  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
612 
613  switch (fdwReason) {
614 
615  case DLL_PROCESS_ATTACH:
616  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
617 
618  return TRUE;
619 
620  case DLL_PROCESS_DETACH:
621  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
622 
623  if (lpReserved != NULL) {
624  // lpReserved is used for telling the difference:
625  // lpReserved == NULL when FreeLibrary() was called,
626  // lpReserved != NULL when the process terminates.
627  // When FreeLibrary() is called, worker threads remain alive. So they will
628  // release the forkjoin lock by themselves. When the process terminates,
629  // worker threads disappear triggering the problem of unreleased forkjoin
630  // lock as described below.
631 
632  // A worker thread can take the forkjoin lock. The problem comes up if
633  // that worker thread becomes dead before it releases the forkjoin lock.
634  // The forkjoin lock remains taken, while the thread executing
635  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
636  // to take the forkjoin lock and will always fail, so that the application
637  // will never finish [normally]. This scenario is possible if
638  // __kmpc_end() has not been executed. This is not just a corner
639  // case; common cases include:
640  // - the main function was compiled by an alternative compiler;
641  // - the main function was compiled by icl but without /Qopenmp
642  // (application with plugins);
643  // - application terminates by calling C exit(), Fortran CALL EXIT() or
644  // Fortran STOP.
645  // - alive foreign thread prevented __kmpc_end from doing cleanup.
646  //
647  // This is a hack to work around the problem.
648  // TODO: !!! figure out something better.
649  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
650  }
651 
652  __kmp_internal_end_library(__kmp_gtid_get_specific());
653 
654  return TRUE;
655 
656  case DLL_THREAD_ATTACH:
657  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
658 
659  /* if we want to register new siblings all the time here call
660  * __kmp_get_gtid(); */
661  return TRUE;
662 
663  case DLL_THREAD_DETACH:
664  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
665 
666  __kmp_internal_end_thread(__kmp_gtid_get_specific());
667  return TRUE;
668  }
669 
670  return TRUE;
671 }
672 
673 #endif /* KMP_OS_WINDOWS */
674 #endif /* KMP_DYNAMIC_LIB */
675 
676 /* Change the library type to "status" and return the old type */
677 /* called from within initialization routines where __kmp_initz_lock is held */
678 int __kmp_change_library(int status) {
679  int old_status;
680 
681  old_status = __kmp_yield_init &
682  1; // check whether KMP_LIBRARY=throughput (even init count)
683 
684  if (status) {
685  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
686  } else {
687  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
688  }
689 
690  return old_status; // return previous setting of whether
691  // KMP_LIBRARY=throughput
692 }
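/* Illustrative sketch (not part of the runtime): __kmp_change_library above
   encodes the library mode in the low bit of a counter, so the mode can be
   flipped and read back without a separate flag word. A minimal version of
   that low-bit encoding, using the hypothetical names mode_counter and
   set_turnaround: */
static unsigned mode_counter = 0; /* low bit clear = throughput, set = turnaround */

static int set_turnaround(int enable) {
  int was_turnaround = mode_counter & 1u; /* previous mode, read from the low bit */
  if (enable)
    mode_counter |= 1u;  /* throughput -> turnaround */
  else
    mode_counter &= ~1u; /* turnaround -> throughput */
  return was_turnaround;
}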
693 
694 /* __kmp_parallel_deo -- Wait until it's our turn. */
695 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696  int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698  kmp_team_t *team = __kmp_team_from_gtid(gtid);
699 #endif /* BUILD_PARALLEL_ORDERED */
700 
701  if (__kmp_env_consistency_check) {
702  if (__kmp_threads[gtid]->th.th_root->r.r_active)
703 #if KMP_USE_DYNAMIC_LOCK
704  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
705 #else
706  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
707 #endif
708  }
709 #ifdef BUILD_PARALLEL_ORDERED
710  if (!team->t.t_serialized) {
711  KMP_MB();
712  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
713  KMP_EQ, NULL);
714  KMP_MB();
715  }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* __kmp_parallel_dxo -- Signal the next task. */
720 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
721  int gtid = *gtid_ref;
722 #ifdef BUILD_PARALLEL_ORDERED
723  int tid = __kmp_tid_from_gtid(gtid);
724  kmp_team_t *team = __kmp_team_from_gtid(gtid);
725 #endif /* BUILD_PARALLEL_ORDERED */
726 
727  if (__kmp_env_consistency_check) {
728  if (__kmp_threads[gtid]->th.th_root->r.r_active)
729  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
730  }
731 #ifdef BUILD_PARALLEL_ORDERED
732  if (!team->t.t_serialized) {
733  KMP_MB(); /* Flush all pending memory write invalidates. */
734 
735  /* use the tid of the next thread in this team */
736  /* TODO replace with general release procedure */
737  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
738 
739  KMP_MB(); /* Flush all pending memory write invalidates. */
740  }
741 #endif /* BUILD_PARALLEL_ORDERED */
742 }
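/* Illustrative sketch (not part of the runtime): __kmp_parallel_deo above
   waits until the team's shared turn value equals this thread's tid, and
   __kmp_parallel_dxo passes the turn to (tid + 1) % nproc. The same protocol
   expressed with C11 atomics (assuming a C11 compiler), using the
   hypothetical names ordered_gate_t, ordered_enter and ordered_exit: */
#include <stdatomic.h>

typedef struct {
  atomic_int turn; /* tid whose turn it currently is */
  int nproc;       /* number of threads in the team */
} ordered_gate_t;

static void ordered_enter(ordered_gate_t *gate, int tid) {
  /* Wait until it is our turn (the runtime yields rather than busy-spinning). */
  while (atomic_load_explicit(&gate->turn, memory_order_acquire) != tid)
    ;
}

static void ordered_exit(ordered_gate_t *gate, int tid) {
  /* Signal the next thread in the team. */
  atomic_store_explicit(&gate->turn, (tid + 1) % gate->nproc,
                        memory_order_release);
}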
743 
744 /* ------------------------------------------------------------------------ */
745 /* The BARRIER for a SINGLE process section is always explicit */
746 
747 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
748  int status;
749  kmp_info_t *th;
750  kmp_team_t *team;
751 
752  if (!TCR_4(__kmp_init_parallel))
753  __kmp_parallel_initialize();
754 
755  th = __kmp_threads[gtid];
756  team = th->th.th_team;
757  status = 0;
758 
759  th->th.th_ident = id_ref;
760 
761  if (team->t.t_serialized) {
762  status = 1;
763  } else {
764  kmp_int32 old_this = th->th.th_local.this_construct;
765 
766  ++th->th.th_local.this_construct;
767  /* try to set team count to thread count--success means thread got the
768  single block */
769  /* TODO: Should this be acquire or release? */
770  if (team->t.t_construct == old_this) {
771  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
772  th->th.th_local.this_construct);
773  }
774 #if USE_ITT_BUILD
775  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
776  KMP_MASTER_GTID(gtid) &&
777 #if OMP_40_ENABLED
778  th->th.th_teams_microtask == NULL &&
779 #endif
780  team->t.t_active_level ==
781  1) { // Only report metadata by master of active team at level 1
782  __kmp_itt_metadata_single(id_ref);
783  }
784 #endif /* USE_ITT_BUILD */
785  }
786 
787  if (__kmp_env_consistency_check) {
788  if (status && push_ws) {
789  __kmp_push_workshare(gtid, ct_psingle, id_ref);
790  } else {
791  __kmp_check_workshare(gtid, ct_psingle, id_ref);
792  }
793  }
794 #if USE_ITT_BUILD
795  if (status) {
796  __kmp_itt_single_start(gtid);
797  }
798 #endif /* USE_ITT_BUILD */
799  return status;
800 }
801 
802 void __kmp_exit_single(int gtid) {
803 #if USE_ITT_BUILD
804  __kmp_itt_single_end(gtid);
805 #endif /* USE_ITT_BUILD */
806  if (__kmp_env_consistency_check)
807  __kmp_pop_workshare(gtid, ct_psingle, NULL);
808 }
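/* Illustrative sketch (not part of the runtime): __kmp_enter_single above
   elects the winner of a single block by letting each thread keep a private
   count of single constructs seen so far and racing to advance the team's
   shared count with a compare-and-swap; the thread whose CAS succeeds
   executes the block. A minimal version with C11 atomics, using the
   hypothetical names single_state_t and try_enter_single: */
#include <stdatomic.h>

typedef struct {
  atomic_int construct_count; /* number of single blocks already claimed */
} single_state_t;

/* my_count is the calling thread's private count of singles encountered.
   Returns 1 if this thread won the block, 0 otherwise. */
static int try_enter_single(single_state_t *team, int *my_count) {
  int expected = *my_count; /* value the shared counter should still hold */
  ++(*my_count);
  return atomic_compare_exchange_strong(&team->construct_count, &expected,
                                        *my_count);
}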
809 
810 /* determine if we can go parallel or must use a serialized parallel region and
811  * how many threads we can use
812  * set_nthreads is the number of threads requested for the team
813  * returns 0 if we should serialize or only use one thread,
814  * otherwise the number of threads to use
815  * The forkjoin lock is held by the caller. */
816 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
817  int master_tid, int set_nthreads
818 #if OMP_40_ENABLED
819  ,
820  int enter_teams
821 #endif /* OMP_40_ENABLED */
822  ) {
823  int capacity;
824  int new_nthreads;
825  KMP_DEBUG_ASSERT(__kmp_init_serial);
826  KMP_DEBUG_ASSERT(root && parent_team);
827 
828  // If dyn-var is set, dynamically adjust the number of desired threads,
829  // according to the method specified by dynamic_mode.
830  new_nthreads = set_nthreads;
831  if (!get__dynamic_2(parent_team, master_tid)) {
832  ;
833  }
834 #ifdef USE_LOAD_BALANCE
835  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
836  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
837  if (new_nthreads == 1) {
838  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
839  "reservation to 1 thread\n",
840  master_tid));
841  return 1;
842  }
843  if (new_nthreads < set_nthreads) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
845  "reservation to %d threads\n",
846  master_tid, new_nthreads));
847  }
848  }
849 #endif /* USE_LOAD_BALANCE */
850  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
851  new_nthreads = __kmp_avail_proc - __kmp_nth +
852  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
853  if (new_nthreads <= 1) {
854  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
855  "reservation to 1 thread\n",
856  master_tid));
857  return 1;
858  }
859  if (new_nthreads < set_nthreads) {
860  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
861  "reservation to %d threads\n",
862  master_tid, new_nthreads));
863  } else {
864  new_nthreads = set_nthreads;
865  }
866  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
867  if (set_nthreads > 2) {
868  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
869  new_nthreads = (new_nthreads % set_nthreads) + 1;
870  if (new_nthreads == 1) {
871  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
872  "reservation to 1 thread\n",
873  master_tid));
874  return 1;
875  }
876  if (new_nthreads < set_nthreads) {
877  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
878  "reservation to %d threads\n",
879  master_tid, new_nthreads));
880  }
881  }
882  } else {
883  KMP_ASSERT(0);
884  }
885 
886  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
887  if (__kmp_nth + new_nthreads -
888  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
889  __kmp_max_nth) {
890  int tl_nthreads = __kmp_max_nth - __kmp_nth +
891  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
892  if (tl_nthreads <= 0) {
893  tl_nthreads = 1;
894  }
895 
896  // If dyn-var is false, emit a 1-time warning.
897  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
898  __kmp_reserve_warn = 1;
899  __kmp_msg(kmp_ms_warning,
900  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
901  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
902  }
903  if (tl_nthreads == 1) {
904  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
905  "reduced reservation to 1 thread\n",
906  master_tid));
907  return 1;
908  }
909  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
910  "reservation to %d threads\n",
911  master_tid, tl_nthreads));
912  new_nthreads = tl_nthreads;
913  }
914 
915  // Respect OMP_THREAD_LIMIT
916  if (root->r.r_cg_nthreads + new_nthreads -
917  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
918  __kmp_cg_max_nth) {
919  int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
920  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
921  if (tl_nthreads <= 0) {
922  tl_nthreads = 1;
923  }
924 
925  // If dyn-var is false, emit a 1-time warning.
926  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
927  __kmp_reserve_warn = 1;
928  __kmp_msg(kmp_ms_warning,
929  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
930  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
931  }
932  if (tl_nthreads == 1) {
933  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
934  "reduced reservation to 1 thread\n",
935  master_tid));
936  return 1;
937  }
938  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
939  "reservation to %d threads\n",
940  master_tid, tl_nthreads));
941  new_nthreads = tl_nthreads;
942  }
943 
944  // Check if the threads array is large enough, or needs expanding.
945  // See comment in __kmp_register_root() about the adjustment if
946  // __kmp_threads[0] == NULL.
947  capacity = __kmp_threads_capacity;
948  if (TCR_PTR(__kmp_threads[0]) == NULL) {
949  --capacity;
950  }
951  if (__kmp_nth + new_nthreads -
952  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
953  capacity) {
954  // Expand the threads array.
955  int slotsRequired = __kmp_nth + new_nthreads -
956  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
957  capacity;
958  int slotsAdded = __kmp_expand_threads(slotsRequired);
959  if (slotsAdded < slotsRequired) {
960  // The threads array was not expanded enough.
961  new_nthreads -= (slotsRequired - slotsAdded);
962  KMP_ASSERT(new_nthreads >= 1);
963 
964  // If dyn-var is false, emit a 1-time warning.
965  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
966  __kmp_reserve_warn = 1;
967  if (__kmp_tp_cached) {
968  __kmp_msg(kmp_ms_warning,
969  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
970  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
971  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
972  } else {
973  __kmp_msg(kmp_ms_warning,
974  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
975  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
976  }
977  }
978  }
979  }
980 
981 #ifdef KMP_DEBUG
982  if (new_nthreads == 1) {
983  KC_TRACE(10,
984  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
985  "dead roots and rechecking; requested %d threads\n",
986  __kmp_get_gtid(), set_nthreads));
987  } else {
988  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
989  " %d threads\n",
990  __kmp_get_gtid(), new_nthreads, set_nthreads));
991  }
992 #endif // KMP_DEBUG
993  return new_nthreads;
994 }
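/* Illustrative sketch (not part of the runtime): each limit check above
   follows the same shape: compute how many threads remain available under a
   given limit, never drop below one (the master always runs), and shrink the
   request if necessary. The name clamp_request is hypothetical. */
static int clamp_request(int requested, int already_in_use, int limit) {
  int available = limit - already_in_use;
  if (available < 1)
    available = 1; /* the master thread can always proceed */
  return requested < available ? requested : available;
}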
995 
996 /* Allocate threads from the thread pool and assign them to the new team. We are
997  assured that there are enough threads available, because we checked on that
998  earlier within the forkjoin critical section. */
999 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1000  kmp_info_t *master_th, int master_gtid) {
1001  int i;
1002  int use_hot_team;
1003 
1004  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1005  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1006  KMP_MB();
1007 
1008  /* first, let's setup the master thread */
1009  master_th->th.th_info.ds.ds_tid = 0;
1010  master_th->th.th_team = team;
1011  master_th->th.th_team_nproc = team->t.t_nproc;
1012  master_th->th.th_team_master = master_th;
1013  master_th->th.th_team_serialized = FALSE;
1014  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1015 
1016 /* make sure we are not the optimized hot team */
1017 #if KMP_NESTED_HOT_TEAMS
1018  use_hot_team = 0;
1019  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1020  if (hot_teams) { // hot teams array is not allocated if
1021  // KMP_HOT_TEAMS_MAX_LEVEL=0
1022  int level = team->t.t_active_level - 1; // index in array of hot teams
1023  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1024  if (master_th->th.th_teams_size.nteams > 1) {
1025  ++level; // level was not increased in teams construct for
1026  // team_of_masters
1027  }
1028  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1029  master_th->th.th_teams_level == team->t.t_level) {
1030  ++level; // level was not increased in teams construct for
1031  // team_of_workers before the parallel
1032  } // team->t.t_level will be increased inside parallel
1033  }
1034  if (level < __kmp_hot_teams_max_level) {
1035  if (hot_teams[level].hot_team) {
1036  // hot team has already been allocated for given level
1037  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1038  use_hot_team = 1; // the team is ready to use
1039  } else {
1040  use_hot_team = 0; // AC: threads are not allocated yet
1041  hot_teams[level].hot_team = team; // remember new hot team
1042  hot_teams[level].hot_team_nth = team->t.t_nproc;
1043  }
1044  } else {
1045  use_hot_team = 0;
1046  }
1047  }
1048 #else
1049  use_hot_team = team == root->r.r_hot_team;
1050 #endif
1051  if (!use_hot_team) {
1052 
1053  /* install the master thread */
1054  team->t.t_threads[0] = master_th;
1055  __kmp_initialize_info(master_th, team, 0, master_gtid);
1056 
1057  /* now, install the worker threads */
1058  for (i = 1; i < team->t.t_nproc; i++) {
1059 
1060  /* fork or reallocate a new thread and install it in team */
1061  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1062  team->t.t_threads[i] = thr;
1063  KMP_DEBUG_ASSERT(thr);
1064  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1065  /* align team and thread arrived states */
1066  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1067  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1068  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1069  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1070  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1071  team->t.t_bar[bs_plain_barrier].b_arrived));
1072 #if OMP_40_ENABLED
1073  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1074  thr->th.th_teams_level = master_th->th.th_teams_level;
1075  thr->th.th_teams_size = master_th->th.th_teams_size;
1076 #endif
1077  { // Initialize threads' barrier data.
1078  int b;
1079  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1080  for (b = 0; b < bs_last_barrier; ++b) {
1081  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1082  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1083 #if USE_DEBUGGER
1084  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1085 #endif
1086  }
1087  }
1088  }
1089 
1090 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1091  __kmp_partition_places(team);
1092 #endif
1093  }
1094 
1095 #if OMP_50_ENABLED
1096  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1097  for (i = 0; i < team->t.t_nproc; i++) {
1098  kmp_info_t *thr = team->t.t_threads[i];
1099  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1100  thr->th.th_prev_level != team->t.t_level) {
1101  team->t.t_display_affinity = 1;
1102  break;
1103  }
1104  }
1105  }
1106 #endif
1107 
1108  KMP_MB();
1109 }
1110 
1111 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1112 // Propagate any changes to the floating point control registers out to the team
1113 // We try to avoid unnecessary writes to the relevant cache line in the team
1114 // structure, so we don't make changes unless they are needed.
1115 inline static void propagateFPControl(kmp_team_t *team) {
1116  if (__kmp_inherit_fp_control) {
1117  kmp_int16 x87_fpu_control_word;
1118  kmp_uint32 mxcsr;
1119 
1120  // Get master values of FPU control flags (both X87 and vector)
1121  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1122  __kmp_store_mxcsr(&mxcsr);
1123  mxcsr &= KMP_X86_MXCSR_MASK;
1124 
1125  // There is no point looking at t_fp_control_saved here.
1126  // If it is TRUE, we still have to update the values if they are different
1127  // from those we now have. If it is FALSE we didn't save anything yet, but
1128  // our objective is the same. We have to ensure that the values in the team
1129  // are the same as those we have.
1130  // So, this code achieves what we need whether or not t_fp_control_saved is
1131  // true. By checking whether the value needs updating we avoid unnecessary
1132  // writes that would put the cache-line into a written state, causing all
1133  // threads in the team to have to read it again.
1134  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1135  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1136  // Although we don't use this value, other code in the runtime wants to know
1137  // whether it should restore them. So we must ensure it is correct.
1138  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1139  } else {
1140  // Similarly here. Don't write to this cache-line in the team structure
1141  // unless we have to.
1142  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1143  }
1144 }
1145 
1146 // Do the opposite, setting the hardware registers to the updated values from
1147 // the team.
1148 inline static void updateHWFPControl(kmp_team_t *team) {
1149  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1150  // Only reset the fp control regs if they have been changed in the team
1151  // during the parallel region that we are exiting.
1152  kmp_int16 x87_fpu_control_word;
1153  kmp_uint32 mxcsr;
1154  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1155  __kmp_store_mxcsr(&mxcsr);
1156  mxcsr &= KMP_X86_MXCSR_MASK;
1157 
1158  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1159  __kmp_clear_x87_fpu_status_word();
1160  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1161  }
1162 
1163  if (team->t.t_mxcsr != mxcsr) {
1164  __kmp_load_mxcsr(&team->t.t_mxcsr);
1165  }
1166  }
1167 }
1168 #else
1169 #define propagateFPControl(x) ((void)0)
1170 #define updateHWFPControl(x) ((void)0)
1171 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
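/* Illustrative sketch (not part of the runtime): KMP_CHECK_UPDATE above
   implements a check-before-write: the store is skipped when the value is
   already correct, so a cache line read by the whole team is not needlessly
   moved to a modified state. A minimal equivalent, with the hypothetical name
   CHECK_UPDATE: */
#define CHECK_UPDATE(dst, val)                                                 \
  do {                                                                         \
    if ((dst) != (val))                                                        \
      (dst) = (val); /* write only when the shared copy would actually change */\
  } while (0)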
1172 
1173 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1174  int realloc); // forward declaration
1175 
1176 /* Run a parallel region that has been serialized, so it runs only in a team
1177  consisting of the single master thread. */
1178 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1179  kmp_info_t *this_thr;
1180  kmp_team_t *serial_team;
1181 
1182  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1183 
1184  /* Skip all this code for autopar serialized loops since it results in
1185  unacceptable overhead */
1186  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1187  return;
1188 
1189  if (!TCR_4(__kmp_init_parallel))
1190  __kmp_parallel_initialize();
1191 
1192  this_thr = __kmp_threads[global_tid];
1193  serial_team = this_thr->th.th_serial_team;
1194 
1195  /* utilize the serialized team held by this thread */
1196  KMP_DEBUG_ASSERT(serial_team);
1197  KMP_MB();
1198 
1199  if (__kmp_tasking_mode != tskm_immediate_exec) {
1200  KMP_DEBUG_ASSERT(
1201  this_thr->th.th_task_team ==
1202  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1203  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1204  NULL);
1205  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1206  "team %p, new task_team = NULL\n",
1207  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1208  this_thr->th.th_task_team = NULL;
1209  }
1210 
1211 #if OMP_40_ENABLED
1212  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1213  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1214  proc_bind = proc_bind_false;
1215  } else if (proc_bind == proc_bind_default) {
1216  // No proc_bind clause was specified, so use the current value
1217  // of proc-bind-var for this parallel region.
1218  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1219  }
1220  // Reset for next parallel region
1221  this_thr->th.th_set_proc_bind = proc_bind_default;
1222 #endif /* OMP_40_ENABLED */
1223 
1224 #if OMPT_SUPPORT
1225  ompt_data_t ompt_parallel_data = ompt_data_none;
1226  ompt_data_t *implicit_task_data;
1227  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1228  if (ompt_enabled.enabled &&
1229  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1230 
1231  ompt_task_info_t *parent_task_info;
1232  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1233 
1234  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1235  if (ompt_enabled.ompt_callback_parallel_begin) {
1236  int team_size = 1;
1237 
1238  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1239  &(parent_task_info->task_data), &(parent_task_info->frame),
1240  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1241  codeptr);
1242  }
1243  }
1244 #endif // OMPT_SUPPORT
1245 
1246  if (this_thr->th.th_team != serial_team) {
1247  // Nested level will be an index in the nested nthreads array
1248  int level = this_thr->th.th_team->t.t_level;
1249 
1250  if (serial_team->t.t_serialized) {
1251  /* this serial team was already used
1252  TODO: increase performance by making these locks more specific */
1253  kmp_team_t *new_team;
1254 
1255  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1256 
1257  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1258 #if OMPT_SUPPORT
1259  ompt_parallel_data,
1260 #endif
1261 #if OMP_40_ENABLED
1262  proc_bind,
1263 #endif
1264  &this_thr->th.th_current_task->td_icvs,
1265  0 USE_NESTED_HOT_ARG(NULL));
1266  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1267  KMP_ASSERT(new_team);
1268 
1269  /* setup new serialized team and install it */
1270  new_team->t.t_threads[0] = this_thr;
1271  new_team->t.t_parent = this_thr->th.th_team;
1272  serial_team = new_team;
1273  this_thr->th.th_serial_team = serial_team;
1274 
1275  KF_TRACE(
1276  10,
1277  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1278  global_tid, serial_team));
1279 
1280  /* TODO the above breaks the requirement that if we run out of resources,
1281  then we can still guarantee that serialized teams are ok, since we may
1282  need to allocate a new one */
1283  } else {
1284  KF_TRACE(
1285  10,
1286  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1287  global_tid, serial_team));
1288  }
1289 
1290  /* we have to initialize this serial team */
1291  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1292  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1293  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1294  serial_team->t.t_ident = loc;
1295  serial_team->t.t_serialized = 1;
1296  serial_team->t.t_nproc = 1;
1297  serial_team->t.t_parent = this_thr->th.th_team;
1298  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1299  this_thr->th.th_team = serial_team;
1300  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1301 
1302  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1303  this_thr->th.th_current_task));
1304  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1305  this_thr->th.th_current_task->td_flags.executing = 0;
1306 
1307  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1308 
1309  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1310  implicit task for each serialized task represented by
1311  team->t.t_serialized? */
1312  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1313  &this_thr->th.th_current_task->td_parent->td_icvs);
1314 
1315  // Thread value exists in the nested nthreads array for the next nested
1316  // level
1317  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1318  this_thr->th.th_current_task->td_icvs.nproc =
1319  __kmp_nested_nth.nth[level + 1];
1320  }
1321 
1322 #if OMP_40_ENABLED
1323  if (__kmp_nested_proc_bind.used &&
1324  (level + 1 < __kmp_nested_proc_bind.used)) {
1325  this_thr->th.th_current_task->td_icvs.proc_bind =
1326  __kmp_nested_proc_bind.bind_types[level + 1];
1327  }
1328 #endif /* OMP_40_ENABLED */
1329 
1330 #if USE_DEBUGGER
1331  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1332 #endif
1333  this_thr->th.th_info.ds.ds_tid = 0;
1334 
1335  /* set thread cache values */
1336  this_thr->th.th_team_nproc = 1;
1337  this_thr->th.th_team_master = this_thr;
1338  this_thr->th.th_team_serialized = 1;
1339 
1340  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1341  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1342 #if OMP_50_ENABLED
1343  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1344 #endif
1345 
1346  propagateFPControl(serial_team);
1347 
1348  /* check if we need to allocate dispatch buffers stack */
1349  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1350  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1351  serial_team->t.t_dispatch->th_disp_buffer =
1352  (dispatch_private_info_t *)__kmp_allocate(
1353  sizeof(dispatch_private_info_t));
1354  }
1355  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1356 
1357  KMP_MB();
1358 
1359  } else {
1360  /* this serialized team is already being used,
1361  * that's fine, just add another nested level */
1362  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1363  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1364  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1365  ++serial_team->t.t_serialized;
1366  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1367 
1368  // Nested level will be an index in the nested nthreads array
1369  int level = this_thr->th.th_team->t.t_level;
1370  // Thread value exists in the nested nthreads array for the next nested
1371  // level
1372  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1373  this_thr->th.th_current_task->td_icvs.nproc =
1374  __kmp_nested_nth.nth[level + 1];
1375  }
1376  serial_team->t.t_level++;
1377  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1378  "of serial team %p to %d\n",
1379  global_tid, serial_team, serial_team->t.t_level));
1380 
1381  /* allocate/push dispatch buffers stack */
1382  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383  {
1384  dispatch_private_info_t *disp_buffer =
1385  (dispatch_private_info_t *)__kmp_allocate(
1386  sizeof(dispatch_private_info_t));
1387  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1388  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1389  }
1390  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1391 
1392  KMP_MB();
1393  }
1394 #if OMP_40_ENABLED
1395  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1396 #endif
1397 
1398 #if OMP_50_ENABLED
1399  // Perform the display affinity functionality for
1400  // serialized parallel regions
1401  if (__kmp_display_affinity) {
1402  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1403  this_thr->th.th_prev_num_threads != 1) {
1404  // NULL means use the affinity-format-var ICV
1405  __kmp_aux_display_affinity(global_tid, NULL);
1406  this_thr->th.th_prev_level = serial_team->t.t_level;
1407  this_thr->th.th_prev_num_threads = 1;
1408  }
1409  }
1410 #endif
1411 
1412  if (__kmp_env_consistency_check)
1413  __kmp_push_parallel(global_tid, NULL);
1414 #if OMPT_SUPPORT
1415  serial_team->t.ompt_team_info.master_return_address = codeptr;
1416  if (ompt_enabled.enabled &&
1417  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1418  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1419 
1420  ompt_lw_taskteam_t lw_taskteam;
1421  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1422  &ompt_parallel_data, codeptr);
1423 
1424  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1425  // don't use lw_taskteam after linking. content was swapped
1426 
1427  /* OMPT implicit task begin */
1428  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1429  if (ompt_enabled.ompt_callback_implicit_task) {
1430  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1431  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1432  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433  OMPT_CUR_TASK_INFO(this_thr)
1434  ->thread_num = __kmp_tid_from_gtid(global_tid);
1435  }
1436 
1437  /* OMPT state */
1438  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1439  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1440  }
1441 #endif
1442 }
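/* Illustrative sketch (not part of the runtime): when a serialized region is
   re-entered above, a fresh dispatch buffer is pushed onto a per-team singly
   linked stack (disp_buffer->next = old top; top = disp_buffer), and the
   matching pop happens when the region ends. The generic pattern, with the
   hypothetical names disp_node_t, push_node and pop_node: */
#include <stdlib.h>

typedef struct disp_node {
  struct disp_node *next; /* link to the buffer of the enclosing region */
  /* per-region dispatch data would live here */
} disp_node_t;

static void push_node(disp_node_t **top) {
  disp_node_t *node = (disp_node_t *)calloc(1, sizeof(*node));
  if (!node)
    return; /* the runtime aborts on allocation failure instead */
  node->next = *top;
  *top = node;
}

static void pop_node(disp_node_t **top) {
  disp_node_t *node = *top;
  if (node) {
    *top = node->next;
    free(node);
  }
}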
1443 
1444 /* most of the work for a fork */
1445 /* return true if we really went parallel, false if serialized */
1446 int __kmp_fork_call(ident_t *loc, int gtid,
1447  enum fork_context_e call_context, // Intel, GNU, ...
1448  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1449 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1450 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1451  va_list *ap
1452 #else
1453  va_list ap
1454 #endif
1455  ) {
1456  void **argv;
1457  int i;
1458  int master_tid;
1459  int master_this_cons;
1460  kmp_team_t *team;
1461  kmp_team_t *parent_team;
1462  kmp_info_t *master_th;
1463  kmp_root_t *root;
1464  int nthreads;
1465  int master_active;
1466  int master_set_numthreads;
1467  int level;
1468 #if OMP_40_ENABLED
1469  int active_level;
1470  int teams_level;
1471 #endif
1472 #if KMP_NESTED_HOT_TEAMS
1473  kmp_hot_team_ptr_t **p_hot_teams;
1474 #endif
1475  { // KMP_TIME_BLOCK
1476  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1477  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1478 
1479  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1480  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1481  /* Some systems prefer the stack for the root thread(s) to start with */
1482  /* some gap from the parent stack to prevent false sharing. */
1483  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1484  /* These 2 lines below are so this does not get optimized out */
1485  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1486  __kmp_stkpadding += (short)((kmp_int64)dummy);
1487  }
1488 
1489  /* initialize if needed */
1490  KMP_DEBUG_ASSERT(
1491  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1492  if (!TCR_4(__kmp_init_parallel))
1493  __kmp_parallel_initialize();
1494 
1495  /* setup current data */
1496  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1497  // shutdown
1498  parent_team = master_th->th.th_team;
1499  master_tid = master_th->th.th_info.ds.ds_tid;
1500  master_this_cons = master_th->th.th_local.this_construct;
1501  root = master_th->th.th_root;
1502  master_active = root->r.r_active;
1503  master_set_numthreads = master_th->th.th_set_nproc;
1504 
1505 #if OMPT_SUPPORT
1506  ompt_data_t ompt_parallel_data = ompt_data_none;
1507  ompt_data_t *parent_task_data;
1508  ompt_frame_t *ompt_frame;
1509  ompt_data_t *implicit_task_data;
1510  void *return_address = NULL;
1511 
1512  if (ompt_enabled.enabled) {
1513  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1514  NULL, NULL);
1515  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1516  }
1517 #endif
1518 
1519  // Nested level will be an index in the nested nthreads array
1520  level = parent_team->t.t_level;
1521  // used to launch non-serial teams even if nested is not allowed
1522  active_level = parent_team->t.t_active_level;
1523 #if OMP_40_ENABLED
1524  // needed to check nesting inside the teams
1525  teams_level = master_th->th.th_teams_level;
1526 #endif
1527 #if KMP_NESTED_HOT_TEAMS
1528  p_hot_teams = &master_th->th.th_hot_teams;
1529  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1530  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1531  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1532  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1533  // it is either actual or not needed (when active_level > 0)
1534  (*p_hot_teams)[0].hot_team_nth = 1;
1535  }
1536 #endif
1537 
1538 #if OMPT_SUPPORT
1539  if (ompt_enabled.enabled) {
1540  if (ompt_enabled.ompt_callback_parallel_begin) {
1541  int team_size = master_set_numthreads
1542  ? master_set_numthreads
1543  : get__nproc_2(parent_team, master_tid);
1544  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1545  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1546  OMPT_INVOKER(call_context), return_address);
1547  }
1548  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1549  }
1550 #endif
1551 
1552  master_th->th.th_ident = loc;
1553 
1554 #if OMP_40_ENABLED
1555  if (master_th->th.th_teams_microtask && ap &&
1556  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1557  // AC: This is the start of a parallel region nested inside a teams construct.
1558  // The team is actual (hot), all workers are ready at the fork barrier.
1559  // No lock needed to initialize the team a bit, then free workers.
1560  parent_team->t.t_ident = loc;
1561  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1562  parent_team->t.t_argc = argc;
1563  argv = (void **)parent_team->t.t_argv;
1564  for (i = argc - 1; i >= 0; --i)
1565 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1566 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1567  *argv++ = va_arg(*ap, void *);
1568 #else
1569  *argv++ = va_arg(ap, void *);
1570 #endif
1571  // Increment our nested depth level, but do not increase the serialization count
1572  if (parent_team == master_th->th.th_serial_team) {
1573  // AC: we are in serialized parallel
1574  __kmpc_serialized_parallel(loc, gtid);
1575  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1576  // AC: needed so that the enquiry functions work correctly;
1577  // will be restored at join time
1578  parent_team->t.t_serialized--;
1579 #if OMPT_SUPPORT
1580  void *dummy;
1581  void **exit_runtime_p;
1582 
1583  ompt_lw_taskteam_t lw_taskteam;
1584 
1585  if (ompt_enabled.enabled) {
1586  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1587  &ompt_parallel_data, return_address);
1588  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1589 
1590  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1591  // don't use lw_taskteam after linking; its contents were swapped
1592 
1593  /* OMPT implicit task begin */
1594  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1595  if (ompt_enabled.ompt_callback_implicit_task) {
1596  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1597  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1598  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1599  OMPT_CUR_TASK_INFO(master_th)
1600  ->thread_num = __kmp_tid_from_gtid(gtid);
1601  }
1602 
1603  /* OMPT state */
1604  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1605  } else {
1606  exit_runtime_p = &dummy;
1607  }
1608 #endif
1609 
1610  {
1611  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1612  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1613  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1614 #if OMPT_SUPPORT
1615  ,
1616  exit_runtime_p
1617 #endif
1618  );
1619  }
1620 
1621 #if OMPT_SUPPORT
1622  *exit_runtime_p = NULL;
1623  if (ompt_enabled.enabled) {
1624  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1625  if (ompt_enabled.ompt_callback_implicit_task) {
1626  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1627  ompt_scope_end, NULL, implicit_task_data, 1,
1628  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1629  }
1630  __ompt_lw_taskteam_unlink(master_th);
1631 
1632  if (ompt_enabled.ompt_callback_parallel_end) {
1633  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1634  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1635  OMPT_INVOKER(call_context), return_address);
1636  }
1637  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1638  }
1639 #endif
1640  return TRUE;
1641  }
1642 
1643  parent_team->t.t_pkfn = microtask;
1644  parent_team->t.t_invoke = invoker;
1645  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1646  parent_team->t.t_active_level++;
1647  parent_team->t.t_level++;
1648 #if OMP_50_ENABLED
1649  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1650 #endif
1651 
1652  /* Change number of threads in the team if requested */
1653  if (master_set_numthreads) { // The parallel has num_threads clause
1654  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1655  // AC: the number of threads can only be reduced dynamically, not increased
1656  kmp_info_t **other_threads = parent_team->t.t_threads;
1657  parent_team->t.t_nproc = master_set_numthreads;
1658  for (i = 0; i < master_set_numthreads; ++i) {
1659  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1660  }
1661  // Keep the extra threads hot in the team for a possible subsequent parallel region
1662  }
1663  master_th->th.th_set_nproc = 0;
1664  }
1665 
1666 #if USE_DEBUGGER
1667  if (__kmp_debugging) { // Let debugger override number of threads.
1668  int nth = __kmp_omp_num_threads(loc);
1669  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1670  master_set_numthreads = nth;
1671  }
1672  }
1673 #endif
1674 
1675  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1676  "master_th=%p, gtid=%d\n",
1677  root, parent_team, master_th, gtid));
1678  __kmp_internal_fork(loc, gtid, parent_team);
1679  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1680  "master_th=%p, gtid=%d\n",
1681  root, parent_team, master_th, gtid));
1682 
1683  /* Invoke microtask for MASTER thread */
1684  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1685  parent_team->t.t_id, parent_team->t.t_pkfn));
1686 
1687  if (!parent_team->t.t_invoke(gtid)) {
1688  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1689  }
1690  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1691  parent_team->t.t_id, parent_team->t.t_pkfn));
1692  KMP_MB(); /* Flush all pending memory write invalidates. */
1693 
1694  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1695 
1696  return TRUE;
1697  } // Parallel closely nested in teams construct
1698 #endif /* OMP_40_ENABLED */
1699 
1700 #if KMP_DEBUG
1701  if (__kmp_tasking_mode != tskm_immediate_exec) {
1702  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1703  parent_team->t.t_task_team[master_th->th.th_task_state]);
1704  }
1705 #endif
1706 
1707  if (parent_team->t.t_active_level >=
1708  master_th->th.th_current_task->td_icvs.max_active_levels) {
1709  nthreads = 1;
1710  } else {
1711 #if OMP_40_ENABLED
1712  int enter_teams = ((ap == NULL && active_level == 0) ||
1713  (ap && teams_level > 0 && teams_level == level));
1714 #endif
1715  nthreads =
1716  master_set_numthreads
1717  ? master_set_numthreads
1718  : get__nproc_2(
1719  parent_team,
1720  master_tid); // TODO: get nproc directly from current task
1721 
1722  // Check whether we need to take the forkjoin lock (no need for a serialized
1723  // parallel outside of a teams construct). This code was moved here from
1724  // __kmp_reserve_threads() to speed up nested serialized parallels.
1725  if (nthreads > 1) {
1726  if ((!get__nested(master_th) && (root->r.r_in_parallel
1727 #if OMP_40_ENABLED
1728  && !enter_teams
1729 #endif /* OMP_40_ENABLED */
1730  )) ||
1731  (__kmp_library == library_serial)) {
1732  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1733  " threads\n",
1734  gtid, nthreads));
1735  nthreads = 1;
1736  }
1737  }
1738  if (nthreads > 1) {
1739  /* determine how many new threads we can use */
1740  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1741  nthreads = __kmp_reserve_threads(
1742  root, parent_team, master_tid, nthreads
1743 #if OMP_40_ENABLED
1744  /* AC: If we execute teams from a parallel region (on the host), then
1745  the teams should be created, but each can have only 1 thread if
1746  nesting is disabled. If teams is called from a serial region, then
1747  the teams and their threads should be created regardless of the
1748  nesting setting. */
1749  ,
1750  enter_teams
1751 #endif /* OMP_40_ENABLED */
1752  );
1753  if (nthreads == 1) {
1754  // Release the lock for single-threaded execution here; for multi-threaded
1755  // execution it will be released later, after the team of threads has been
1756  // created and initialized
1757  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1758  }
1759  }
1760  }
1761  KMP_DEBUG_ASSERT(nthreads > 0);
1762 
1763  // If we temporarily changed the set number of threads then restore it now
1764  master_th->th.th_set_nproc = 0;
1765 
1766  /* create a serialized parallel region? */
1767  if (nthreads == 1) {
1768 /* josh todo: hypothetical question: what do we do for OS X*? */
1769 #if KMP_OS_LINUX && \
1770  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1771  void *args[argc];
1772 #else
1773  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1774 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1775  KMP_ARCH_AARCH64) */
1776 
1777  KA_TRACE(20,
1778  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1779 
1780  __kmpc_serialized_parallel(loc, gtid);
1781 
1782  if (call_context == fork_context_intel) {
1783  /* TODO this sucks, use the compiler itself to pass args! :) */
1784  master_th->th.th_serial_team->t.t_ident = loc;
1785 #if OMP_40_ENABLED
1786  if (!ap) {
1787  // revert change made in __kmpc_serialized_parallel()
1788  master_th->th.th_serial_team->t.t_level--;
1789 // Get args from parent team for teams construct
1790 
1791 #if OMPT_SUPPORT
1792  void *dummy;
1793  void **exit_runtime_p;
1794  ompt_task_info_t *task_info;
1795 
1796  ompt_lw_taskteam_t lw_taskteam;
1797 
1798  if (ompt_enabled.enabled) {
1799  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1800  &ompt_parallel_data, return_address);
1801 
1802  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1803  // don't use lw_taskteam after linking; its contents were swapped
1804 
1805  task_info = OMPT_CUR_TASK_INFO(master_th);
1806  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1807  if (ompt_enabled.ompt_callback_implicit_task) {
1808  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1809  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1810  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1811  OMPT_CUR_TASK_INFO(master_th)
1812  ->thread_num = __kmp_tid_from_gtid(gtid);
1813  }
1814 
1815  /* OMPT state */
1816  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1817  } else {
1818  exit_runtime_p = &dummy;
1819  }
1820 #endif
1821 
1822  {
1823  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1824  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1825  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1826  parent_team->t.t_argv
1827 #if OMPT_SUPPORT
1828  ,
1829  exit_runtime_p
1830 #endif
1831  );
1832  }
1833 
1834 #if OMPT_SUPPORT
1835  if (ompt_enabled.enabled) {
1836  exit_runtime_p = NULL;
1837  if (ompt_enabled.ompt_callback_implicit_task) {
1838  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1839  ompt_scope_end, NULL, &(task_info->task_data), 1,
1840  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1841  }
1842 
1843  __ompt_lw_taskteam_unlink(master_th);
1844  if (ompt_enabled.ompt_callback_parallel_end) {
1845  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1846  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1847  OMPT_INVOKER(call_context), return_address);
1848  }
1849  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1850  }
1851 #endif
1852  } else if (microtask == (microtask_t)__kmp_teams_master) {
1853  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1854  master_th->th.th_serial_team);
1855  team = master_th->th.th_team;
1856  // team->t.t_pkfn = microtask;
1857  team->t.t_invoke = invoker;
1858  __kmp_alloc_argv_entries(argc, team, TRUE);
1859  team->t.t_argc = argc;
1860  argv = (void **)team->t.t_argv;
1861  if (ap) {
1862  for (i = argc - 1; i >= 0; --i)
1863 // TODO: revert workaround for Intel(R) 64 tracker #96
1864 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1865  *argv++ = va_arg(*ap, void *);
1866 #else
1867  *argv++ = va_arg(ap, void *);
1868 #endif
1869  } else {
1870  for (i = 0; i < argc; ++i)
1871  // Get args from parent team for teams construct
1872  argv[i] = parent_team->t.t_argv[i];
1873  }
1874  // AC: revert change made in __kmpc_serialized_parallel()
1875  // because initial code in teams should have level=0
1876  team->t.t_level--;
1877  // AC: call special invoker for outer "parallel" of teams construct
1878  invoker(gtid);
1879  } else {
1880 #endif /* OMP_40_ENABLED */
1881  argv = args;
1882  for (i = argc - 1; i >= 0; --i)
1883 // TODO: revert workaround for Intel(R) 64 tracker #96
1884 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1885  *argv++ = va_arg(*ap, void *);
1886 #else
1887  *argv++ = va_arg(ap, void *);
1888 #endif
1889  KMP_MB();
1890 
1891 #if OMPT_SUPPORT
1892  void *dummy;
1893  void **exit_runtime_p;
1894  ompt_task_info_t *task_info;
1895 
1896  ompt_lw_taskteam_t lw_taskteam;
1897 
1898  if (ompt_enabled.enabled) {
1899  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1900  &ompt_parallel_data, return_address);
1901  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1902  // don't use lw_taskteam after linking; its contents were swapped
1903  task_info = OMPT_CUR_TASK_INFO(master_th);
1904  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1905 
1906  /* OMPT implicit task begin */
1907  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1908  if (ompt_enabled.ompt_callback_implicit_task) {
1909  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1910  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1911  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1912  OMPT_CUR_TASK_INFO(master_th)
1913  ->thread_num = __kmp_tid_from_gtid(gtid);
1914  }
1915 
1916  /* OMPT state */
1917  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1918  } else {
1919  exit_runtime_p = &dummy;
1920  }
1921 #endif
1922 
1923  {
1924  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1925  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1926  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1927 #if OMPT_SUPPORT
1928  ,
1929  exit_runtime_p
1930 #endif
1931  );
1932  }
1933 
1934 #if OMPT_SUPPORT
1935  if (ompt_enabled.enabled) {
1936  *exit_runtime_p = NULL;
1937  if (ompt_enabled.ompt_callback_implicit_task) {
1938  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1939  ompt_scope_end, NULL, &(task_info->task_data), 1,
1940  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1941  }
1942 
1943  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1944  __ompt_lw_taskteam_unlink(master_th);
1945  if (ompt_enabled.ompt_callback_parallel_end) {
1946  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1947  &ompt_parallel_data, parent_task_data,
1948  OMPT_INVOKER(call_context), return_address);
1949  }
1950  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1951  }
1952 #endif
1953 #if OMP_40_ENABLED
1954  }
1955 #endif /* OMP_40_ENABLED */
1956  } else if (call_context == fork_context_gnu) {
1957 #if OMPT_SUPPORT
1958  ompt_lw_taskteam_t lwt;
1959  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1960  return_address);
1961 
1962  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1963  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1964 // don't use lw_taskteam after linking; its contents were swapped
1965 #endif
1966 
1967  // we were called from GNU native code
1968  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1969  return FALSE;
1970  } else {
1971  KMP_ASSERT2(call_context < fork_context_last,
1972  "__kmp_fork_call: unknown fork_context parameter");
1973  }
1974 
1975  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1976  KMP_MB();
1977  return FALSE;
1978  } // if (nthreads == 1)
1979 
1980  // GEH: only modify the executing flag in the non-serialized case;
1981  // the serialized case is handled in __kmpc_serialized_parallel
1982  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1983  "curtask=%p, curtask_max_aclevel=%d\n",
1984  parent_team->t.t_active_level, master_th,
1985  master_th->th.th_current_task,
1986  master_th->th.th_current_task->td_icvs.max_active_levels));
1987  // TODO: GEH - cannot do this assertion because root thread not set up as
1988  // executing
1989  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1990  master_th->th.th_current_task->td_flags.executing = 0;
1991 
1992 #if OMP_40_ENABLED
1993  if (!master_th->th.th_teams_microtask || level > teams_level)
1994 #endif /* OMP_40_ENABLED */
1995  {
1996  /* Increment our nested depth level */
1997  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1998  }
1999 
2000  // See if we need to make a copy of the ICVs.
2001  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2002  if ((level + 1 < __kmp_nested_nth.used) &&
2003  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2004  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2005  } else {
2006  nthreads_icv = 0; // don't update
2007  }
2008 
2009 #if OMP_40_ENABLED
2010  // Figure out the proc_bind_policy for the new team.
2011  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2012  kmp_proc_bind_t proc_bind_icv =
2013  proc_bind_default; // proc_bind_default means don't update
2014  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2015  proc_bind = proc_bind_false;
2016  } else {
2017  if (proc_bind == proc_bind_default) {
2018  // No proc_bind clause specified; use current proc-bind-var for this
2019  // parallel region
2020  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2021  }
2022  /* else: The proc_bind policy was specified explicitly on parallel clause.
2023  This overrides proc-bind-var for this parallel region, but does not
2024  change proc-bind-var. */
2025  // Figure the value of proc-bind-var for the child threads.
2026  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2027  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2028  master_th->th.th_current_task->td_icvs.proc_bind)) {
2029  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2030  }
2031  }
2032 
2033  // Reset for next parallel region
2034  master_th->th.th_set_proc_bind = proc_bind_default;
2035 #endif /* OMP_40_ENABLED */
2036 
2037  if ((nthreads_icv > 0)
2038 #if OMP_40_ENABLED
2039  || (proc_bind_icv != proc_bind_default)
2040 #endif /* OMP_40_ENABLED */
2041  ) {
2042  kmp_internal_control_t new_icvs;
2043  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2044  new_icvs.next = NULL;
2045  if (nthreads_icv > 0) {
2046  new_icvs.nproc = nthreads_icv;
2047  }
2048 
2049 #if OMP_40_ENABLED
2050  if (proc_bind_icv != proc_bind_default) {
2051  new_icvs.proc_bind = proc_bind_icv;
2052  }
2053 #endif /* OMP_40_ENABLED */
2054 
2055  /* allocate a new parallel team */
2056  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057  team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059  ompt_parallel_data,
2060 #endif
2061 #if OMP_40_ENABLED
2062  proc_bind,
2063 #endif
2064  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2065  } else {
2066  /* allocate a new parallel team */
2067  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2068  team = __kmp_allocate_team(root, nthreads, nthreads,
2069 #if OMPT_SUPPORT
2070  ompt_parallel_data,
2071 #endif
2072 #if OMP_40_ENABLED
2073  proc_bind,
2074 #endif
2075  &master_th->th.th_current_task->td_icvs,
2076  argc USE_NESTED_HOT_ARG(master_th));
2077  }
2078  KF_TRACE(
2079  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2080 
2081  /* setup the new team */
2082  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2083  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2084  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2085  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2086  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2087 #if OMPT_SUPPORT
2088  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2089  return_address);
2090 #endif
2091  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2092 // TODO: parent_team->t.t_level == INT_MAX ???
2093 #if OMP_40_ENABLED
2094  if (!master_th->th.th_teams_microtask || level > teams_level) {
2095 #endif /* OMP_40_ENABLED */
2096  int new_level = parent_team->t.t_level + 1;
2097  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2098  new_level = parent_team->t.t_active_level + 1;
2099  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2100 #if OMP_40_ENABLED
2101  } else {
2102  // AC: Do not increase parallel level at start of the teams construct
2103  int new_level = parent_team->t.t_level;
2104  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2105  new_level = parent_team->t.t_active_level;
2106  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2107  }
2108 #endif /* OMP_40_ENABLED */
2109  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2110  // set master's schedule as new run-time schedule
2111  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2112 
2113 #if OMP_40_ENABLED
2114  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2115 #endif
2116 #if OMP_50_ENABLED
2117  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2118 #endif
2119 
2120  // Update the floating point rounding in the team if required.
2121  propagateFPControl(team);
2122 
2123  if (__kmp_tasking_mode != tskm_immediate_exec) {
2124  // Set the master's task team to the team's task team. Unless this is a hot
2125  // team, it should be NULL.
2126  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2127  parent_team->t.t_task_team[master_th->th.th_task_state]);
2128  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2129  "%p, new task_team %p / team %p\n",
2130  __kmp_gtid_from_thread(master_th),
2131  master_th->th.th_task_team, parent_team,
2132  team->t.t_task_team[master_th->th.th_task_state], team));
2133 
2134  if (active_level || master_th->th.th_task_team) {
2135  // Save the master's task_state on the memo stack
2136  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2137  if (master_th->th.th_task_state_top >=
2138  master_th->th.th_task_state_stack_sz) { // increase size
2139  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2140  kmp_uint8 *old_stack, *new_stack;
2141  kmp_uint32 i;
2142  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2143  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2144  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2145  }
2146  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2147  ++i) { // zero-init rest of stack
2148  new_stack[i] = 0;
2149  }
2150  old_stack = master_th->th.th_task_state_memo_stack;
2151  master_th->th.th_task_state_memo_stack = new_stack;
2152  master_th->th.th_task_state_stack_sz = new_size;
2153  __kmp_free(old_stack);
2154  }
2155  // Store master's task_state on stack
2156  master_th->th
2157  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2158  master_th->th.th_task_state;
2159  master_th->th.th_task_state_top++;
2160 #if KMP_NESTED_HOT_TEAMS
2161  if (master_th->th.th_hot_teams &&
2162  active_level < __kmp_hot_teams_max_level &&
2163  team == master_th->th.th_hot_teams[active_level].hot_team) {
2164  // Restore master's nested state if nested hot team
2165  master_th->th.th_task_state =
2166  master_th->th
2167  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2168  } else {
2169 #endif
2170  master_th->th.th_task_state = 0;
2171 #if KMP_NESTED_HOT_TEAMS
2172  }
2173 #endif
2174  }
2175 #if !KMP_NESTED_HOT_TEAMS
2176  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2177  (team == root->r.r_hot_team));
2178 #endif
2179  }
2180 
2181  KA_TRACE(
2182  20,
2183  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2184  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2185  team->t.t_nproc));
2186  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2187  (team->t.t_master_tid == 0 &&
2188  (team->t.t_parent == root->r.r_root_team ||
2189  team->t.t_parent->t.t_serialized)));
2190  KMP_MB();
2191 
2192  /* now, setup the arguments */
2193  argv = (void **)team->t.t_argv;
2194 #if OMP_40_ENABLED
2195  if (ap) {
2196 #endif /* OMP_40_ENABLED */
2197  for (i = argc - 1; i >= 0; --i) {
2198 // TODO: revert workaround for Intel(R) 64 tracker #96
2199 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2200  void *new_argv = va_arg(*ap, void *);
2201 #else
2202  void *new_argv = va_arg(ap, void *);
2203 #endif
2204  KMP_CHECK_UPDATE(*argv, new_argv);
2205  argv++;
2206  }
2207 #if OMP_40_ENABLED
2208  } else {
2209  for (i = 0; i < argc; ++i) {
2210  // Get args from parent team for teams construct
2211  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2212  }
2213  }
2214 #endif /* OMP_40_ENABLED */
2215 
2216  /* now actually fork the threads */
2217  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2218  if (!root->r.r_active) // Only assign when the value changes, to avoid cache ping-pong
2219  root->r.r_active = TRUE;
2220 
2221  __kmp_fork_team_threads(root, team, master_th, gtid);
2222  __kmp_setup_icv_copy(team, nthreads,
2223  &master_th->th.th_current_task->td_icvs, loc);
2224 
2225 #if OMPT_SUPPORT
2226  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2227 #endif
2228 
2229  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2230 
2231 #if USE_ITT_BUILD
2232  if (team->t.t_active_level == 1 // only report frames at level 1
2233 #if OMP_40_ENABLED
2234  && !master_th->th.th_teams_microtask // not in teams construct
2235 #endif /* OMP_40_ENABLED */
2236  ) {
2237 #if USE_ITT_NOTIFY
2238  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2239  (__kmp_forkjoin_frames_mode == 3 ||
2240  __kmp_forkjoin_frames_mode == 1)) {
2241  kmp_uint64 tmp_time = 0;
2242  if (__itt_get_timestamp_ptr)
2243  tmp_time = __itt_get_timestamp();
2244  // Internal fork - report frame begin
2245  master_th->th.th_frame_time = tmp_time;
2246  if (__kmp_forkjoin_frames_mode == 3)
2247  team->t.t_region_time = tmp_time;
2248  } else
2249 // only one notification scheme (either "submit" or "forking/joined", not both)
2250 #endif /* USE_ITT_NOTIFY */
2251  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2252  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2253  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2254  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2255  }
2256  }
2257 #endif /* USE_ITT_BUILD */
2258 
2259  /* now go on and do the work */
2260  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2261  KMP_MB();
2262  KF_TRACE(10,
2263  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2264  root, team, master_th, gtid));
2265 
2266 #if USE_ITT_BUILD
2267  if (__itt_stack_caller_create_ptr) {
2268  team->t.t_stack_id =
2269  __kmp_itt_stack_caller_create(); // create new stack stitching id
2270  // before entering fork barrier
2271  }
2272 #endif /* USE_ITT_BUILD */
2273 
2274 #if OMP_40_ENABLED
2275  // AC: skip __kmp_internal_fork for the teams construct; let only the master
2276  // threads execute
2277  if (ap)
2278 #endif /* OMP_40_ENABLED */
2279  {
2280  __kmp_internal_fork(loc, gtid, team);
2281  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2282  "master_th=%p, gtid=%d\n",
2283  root, team, master_th, gtid));
2284  }
2285 
2286  if (call_context == fork_context_gnu) {
2287  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2288  return TRUE;
2289  }
2290 
2291  /* Invoke microtask for MASTER thread */
2292  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2293  team->t.t_id, team->t.t_pkfn));
2294  } // END of timer KMP_fork_call block
2295 
2296  if (!team->t.t_invoke(gtid)) {
2297  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2298  }
2299  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2300  team->t.t_id, team->t.t_pkfn));
2301  KMP_MB(); /* Flush all pending memory write invalidates. */
2302 
2303  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2304 
2305 #if OMPT_SUPPORT
2306  if (ompt_enabled.enabled) {
2307  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2308  }
2309 #endif
2310 
2311  return TRUE;
2312 }
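/* A user-level sketch of the path through this routine (illustrative only; the
   entry-point names depend on the compiler, and __kmpc_fork_call is assumed here):

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       #pragma omp parallel num_threads(4)
       printf("hello from %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
       return 0;
     }

   The compiler typically outlines the parallel body into a microtask and emits a
   call to __kmpc_fork_call, which lands in __kmp_fork_call above: a team is
   allocated or reused (hot team), the master runs the microtask through
   team->t.t_invoke, and the matching __kmp_join_call below tears the region down. */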
2313 
2314 #if OMPT_SUPPORT
2315 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2316  kmp_team_t *team) {
2317  // restore state outside the region
2318  thread->th.ompt_thread_info.state =
2319  ((team->t.t_serialized) ? ompt_state_work_serial
2320  : ompt_state_work_parallel);
2321 }
2322 
2323 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2324  kmp_team_t *team, ompt_data_t *parallel_data,
2325  fork_context_e fork_context, void *codeptr) {
2326  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2327  if (ompt_enabled.ompt_callback_parallel_end) {
2328  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2329  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2330  codeptr);
2331  }
2332 
2333  task_info->frame.enter_frame = ompt_data_none;
2334  __kmp_join_restore_state(thread, team);
2335 }
2336 #endif
2337 
2338 void __kmp_join_call(ident_t *loc, int gtid
2339 #if OMPT_SUPPORT
2340  ,
2341  enum fork_context_e fork_context
2342 #endif
2343 #if OMP_40_ENABLED
2344  ,
2345  int exit_teams
2346 #endif /* OMP_40_ENABLED */
2347  ) {
2348  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2349  kmp_team_t *team;
2350  kmp_team_t *parent_team;
2351  kmp_info_t *master_th;
2352  kmp_root_t *root;
2353  int master_active;
2354  int i;
2355 
2356  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2357 
2358  /* setup current data */
2359  master_th = __kmp_threads[gtid];
2360  root = master_th->th.th_root;
2361  team = master_th->th.th_team;
2362  parent_team = team->t.t_parent;
2363 
2364  master_th->th.th_ident = loc;
2365 
2366 #if OMPT_SUPPORT
2367  if (ompt_enabled.enabled) {
2368  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2369  }
2370 #endif
2371 
2372 #if KMP_DEBUG
2373  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2374  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2375  "th_task_team = %p\n",
2376  __kmp_gtid_from_thread(master_th), team,
2377  team->t.t_task_team[master_th->th.th_task_state],
2378  master_th->th.th_task_team));
2379  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2380  team->t.t_task_team[master_th->th.th_task_state]);
2381  }
2382 #endif
2383 
2384  if (team->t.t_serialized) {
2385 #if OMP_40_ENABLED
2386  if (master_th->th.th_teams_microtask) {
2387  // We are in teams construct
2388  int level = team->t.t_level;
2389  int tlevel = master_th->th.th_teams_level;
2390  if (level == tlevel) {
2391  // AC: we haven't incremented it earlier at start of teams construct,
2392  // so do it here - at the end of teams construct
2393  team->t.t_level++;
2394  } else if (level == tlevel + 1) {
2395  // AC: we are exiting parallel inside teams, need to increment
2396  // serialization in order to restore it in the next call to
2397  // __kmpc_end_serialized_parallel
2398  team->t.t_serialized++;
2399  }
2400  }
2401 #endif /* OMP_40_ENABLED */
2402  __kmpc_end_serialized_parallel(loc, gtid);
2403 
2404 #if OMPT_SUPPORT
2405  if (ompt_enabled.enabled) {
2406  __kmp_join_restore_state(master_th, parent_team);
2407  }
2408 #endif
2409 
2410  return;
2411  }
2412 
2413  master_active = team->t.t_master_active;
2414 
2415 #if OMP_40_ENABLED
2416  if (!exit_teams)
2417 #endif /* OMP_40_ENABLED */
2418  {
2419  // AC: No barrier for internal teams at exit from teams construct.
2420  // But there is barrier for external team (league).
2421  __kmp_internal_join(loc, gtid, team);
2422  }
2423 #if OMP_40_ENABLED
2424  else {
2425  master_th->th.th_task_state =
2426  0; // AC: no tasking in teams (out of any parallel)
2427  }
2428 #endif /* OMP_40_ENABLED */
2429 
2430  KMP_MB();
2431 
2432 #if OMPT_SUPPORT
2433  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2434  void *codeptr = team->t.ompt_team_info.master_return_address;
2435 #endif
2436 
2437 #if USE_ITT_BUILD
2438  if (__itt_stack_caller_create_ptr) {
2439  __kmp_itt_stack_caller_destroy(
2440  (__itt_caller)team->t
2441  .t_stack_id); // destroy the stack stitching id after join barrier
2442  }
2443 
2444  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2445  if (team->t.t_active_level == 1
2446 #if OMP_40_ENABLED
2447  && !master_th->th.th_teams_microtask /* not in teams construct */
2448 #endif /* OMP_40_ENABLED */
2449  ) {
2450  master_th->th.th_ident = loc;
2451  // only one notification scheme (either "submit" or "forking/joined", not
2452  // both)
2453  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2454  __kmp_forkjoin_frames_mode == 3)
2455  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2456  master_th->th.th_frame_time, 0, loc,
2457  master_th->th.th_team_nproc, 1);
2458  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2459  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2460  __kmp_itt_region_joined(gtid);
2461  } // active_level == 1
2462 #endif /* USE_ITT_BUILD */
2463 
2464 #if OMP_40_ENABLED
2465  if (master_th->th.th_teams_microtask && !exit_teams &&
2466  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2467  team->t.t_level == master_th->th.th_teams_level + 1) {
2468  // AC: We need to leave the team structure intact at the end of a parallel
2469  // inside the teams construct, so that the same (hot) team works at the next
2470  // parallel region; only adjust the nesting levels
2471 
2472  /* Decrement our nested depth level */
2473  team->t.t_level--;
2474  team->t.t_active_level--;
2475  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2476 
2477  /* Restore number of threads in the team if needed */
2478  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2479  int old_num = master_th->th.th_team_nproc;
2480  int new_num = master_th->th.th_teams_size.nth;
2481  kmp_info_t **other_threads = team->t.t_threads;
2482  team->t.t_nproc = new_num;
2483  for (i = 0; i < old_num; ++i) {
2484  other_threads[i]->th.th_team_nproc = new_num;
2485  }
2486  // Adjust states of non-used threads of the team
2487  for (i = old_num; i < new_num; ++i) {
2488  // Re-initialize thread's barrier data.
2489  int b;
2490  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2491  for (b = 0; b < bs_last_barrier; ++b) {
2492  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2493  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2494 #if USE_DEBUGGER
2495  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2496 #endif
2497  }
2498  if (__kmp_tasking_mode != tskm_immediate_exec) {
2499  // Synchronize thread's task state
2500  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2501  }
2502  }
2503  }
2504 
2505 #if OMPT_SUPPORT
2506  if (ompt_enabled.enabled) {
2507  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2508  codeptr);
2509  }
2510 #endif
2511 
2512  return;
2513  }
2514 #endif /* OMP_40_ENABLED */
2515 
2516  /* do cleanup and restore the parent team */
2517  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2518  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2519 
2520  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2521 
2522  /* jc: The following lock has instructions with REL and ACQ semantics,
2523  separating the parallel user code called in this parallel region
2524  from the serial user code called after this function returns. */
2525  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2526 
2527 #if OMP_40_ENABLED
2528  if (!master_th->th.th_teams_microtask ||
2529  team->t.t_level > master_th->th.th_teams_level)
2530 #endif /* OMP_40_ENABLED */
2531  {
2532  /* Decrement our nested depth level */
2533  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2534  }
2535  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2536 
2537 #if OMPT_SUPPORT
2538  if (ompt_enabled.enabled) {
2539  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2540  if (ompt_enabled.ompt_callback_implicit_task) {
2541  int ompt_team_size = team->t.t_nproc;
2542  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2543  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2544  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2545  }
2546 
2547  task_info->frame.exit_frame = ompt_data_none;
2548  task_info->task_data = ompt_data_none;
2549  }
2550 #endif
2551 
2552  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2553  master_th, team));
2554  __kmp_pop_current_task_from_thread(master_th);
2555 
2556 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2557  // Restore master thread's partition.
2558  master_th->th.th_first_place = team->t.t_first_place;
2559  master_th->th.th_last_place = team->t.t_last_place;
2560 #endif /* OMP_40_ENABLED */
2561 #if OMP_50_ENABLED
2562  master_th->th.th_def_allocator = team->t.t_def_allocator;
2563 #endif
2564 
2565  updateHWFPControl(team);
2566 
2567  if (root->r.r_active != master_active)
2568  root->r.r_active = master_active;
2569 
2570  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2571  master_th)); // this will free worker threads
2572 
2573  /* This race was fun to find. Make sure the following is in the critical
2574  region; otherwise assertions may fail occasionally, since the old team may be
2575  reallocated and the hierarchy appears inconsistent. It is actually safe to
2576  run and won't cause any bugs, but it will cause those assertion failures. It's
2577  only one deref&assign, so we might as well put it in the critical region. */
2578  master_th->th.th_team = parent_team;
2579  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2580  master_th->th.th_team_master = parent_team->t.t_threads[0];
2581  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2582 
2583  /* restore serialized team, if need be */
2584  if (parent_team->t.t_serialized &&
2585  parent_team != master_th->th.th_serial_team &&
2586  parent_team != root->r.r_root_team) {
2587  __kmp_free_team(root,
2588  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2589  master_th->th.th_serial_team = parent_team;
2590  }
2591 
2592  if (__kmp_tasking_mode != tskm_immediate_exec) {
2593  if (master_th->th.th_task_state_top >
2594  0) { // Restore task state from memo stack
2595  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2596  // Remember master's state if we re-use this nested hot team
2597  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2598  master_th->th.th_task_state;
2599  --master_th->th.th_task_state_top; // pop
2600  // Now restore state at this level
2601  master_th->th.th_task_state =
2602  master_th->th
2603  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2604  }
2605  // Copy the task team from the parent team to the master thread
2606  master_th->th.th_task_team =
2607  parent_team->t.t_task_team[master_th->th.th_task_state];
2608  KA_TRACE(20,
2609  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2610  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2611  parent_team));
2612  }
2613 
2614  // TODO: GEH - cannot do this assertion because root thread not set up as
2615  // executing
2616  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2617  master_th->th.th_current_task->td_flags.executing = 1;
2618 
2619  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2620 
2621 #if OMPT_SUPPORT
2622  if (ompt_enabled.enabled) {
2623  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2624  codeptr);
2625  }
2626 #endif
2627 
2628  KMP_MB();
2629  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2630 }
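/* A simplified sketch of the join paths handled above (assumes nesting is enabled,
   e.g. via omp_set_nested(1), so the inner region is not trivially serialized):

     #pragma omp parallel num_threads(2)   // outer fork: hot team, task_state pushed
     {
       #pragma omp parallel num_threads(2) // inner fork: nested (or serialized) team
       { }
     }                                     // joins unwind in reverse order

   A serialized inner join (team->t.t_serialized != 0) goes through
   __kmpc_end_serialized_parallel; a real join runs the join barrier, releases the
   team via __kmp_free_team (hot teams are kept for reuse), and restores the
   master's task_state from th_task_state_memo_stack. */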
2631 
2632 /* Check whether we should push an internal control record onto the
2633  serial team stack. If so, do it. */
2634 void __kmp_save_internal_controls(kmp_info_t *thread) {
2635 
2636  if (thread->th.th_team != thread->th.th_serial_team) {
2637  return;
2638  }
2639  if (thread->th.th_team->t.t_serialized > 1) {
2640  int push = 0;
2641 
2642  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2643  push = 1;
2644  } else {
2645  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2646  thread->th.th_team->t.t_serialized) {
2647  push = 1;
2648  }
2649  }
2650  if (push) { /* push a record on the serial team's stack */
2651  kmp_internal_control_t *control =
2652  (kmp_internal_control_t *)__kmp_allocate(
2653  sizeof(kmp_internal_control_t));
2654 
2655  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2656 
2657  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2658 
2659  control->next = thread->th.th_team->t.t_control_stack_top;
2660  thread->th.th_team->t.t_control_stack_top = control;
2661  }
2662  }
2663 }
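/* A minimal sketch of the situation this guards (assuming serialized nesting, as
   produced here by if(0) clauses): an ICV change inside a nested serialized
   parallel pushes a record so the change can be undone at region exit.

     #pragma omp parallel if(0)       // outer serialized parallel
     {
       #pragma omp parallel if(0)     // nested serialized parallel: t_serialized > 1
       {
         omp_set_num_threads(8);      // pre-change ICVs are pushed onto
       }                              //   t_control_stack_top and restored by
     }                                //   __kmpc_end_serialized_parallel
*/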
2664 
2665 /* Changes set_nproc */
2666 void __kmp_set_num_threads(int new_nth, int gtid) {
2667  kmp_info_t *thread;
2668  kmp_root_t *root;
2669 
2670  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2671  KMP_DEBUG_ASSERT(__kmp_init_serial);
2672 
2673  if (new_nth < 1)
2674  new_nth = 1;
2675  else if (new_nth > __kmp_max_nth)
2676  new_nth = __kmp_max_nth;
2677 
2678  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2679  thread = __kmp_threads[gtid];
2680  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2681  return; // nothing to do
2682 
2683  __kmp_save_internal_controls(thread);
2684 
2685  set__nproc(thread, new_nth);
2686 
2687  // If this omp_set_num_threads() call will cause the hot team size to be
2688  // reduced (in the absence of a num_threads clause), then reduce it now,
2689  // rather than waiting for the next parallel region.
2690  root = thread->th.th_root;
2691  if (__kmp_init_parallel && (!root->r.r_active) &&
2692  (root->r.r_hot_team->t.t_nproc > new_nth)
2693 #if KMP_NESTED_HOT_TEAMS
2694  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2695 #endif
2696  ) {
2697  kmp_team_t *hot_team = root->r.r_hot_team;
2698  int f;
2699 
2700  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2701 
2702  // Release the extra threads we don't need any more.
2703  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2704  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2705  if (__kmp_tasking_mode != tskm_immediate_exec) {
2706  // When decreasing team size, threads no longer in the team should unref
2707  // task team.
2708  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2709  }
2710  __kmp_free_thread(hot_team->t.t_threads[f]);
2711  hot_team->t.t_threads[f] = NULL;
2712  }
2713  hot_team->t.t_nproc = new_nth;
2714 #if KMP_NESTED_HOT_TEAMS
2715  if (thread->th.th_hot_teams) {
2716  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2717  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2718  }
2719 #endif
2720 
2721  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2722 
2723  // Update the t_nproc field in the threads that are still active.
2724  for (f = 0; f < new_nth; f++) {
2725  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2726  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2727  }
2728  // Special flag to note that the size was changed by an omp_set_num_threads() call
2729  hot_team->t.t_size_changed = -1;
2730  }
2731 }
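/* A minimal usage sketch (standard OpenMP API; the user entry point is assumed to
   reach this routine through the library's API layer):

     omp_set_num_threads(2);   // sets the nproc ICV of the calling thread; if the
                               //   hot team is idle and larger, it is shrunk above
     #pragma omp parallel      // the next fork sees nproc == 2
     { }

   A num_threads clause on a particular parallel directive overrides the ICV for
   that one region without changing it. */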
2732 
2733 /* Changes max_active_levels */
2734 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2735  kmp_info_t *thread;
2736 
2737  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2738  "%d = (%d)\n",
2739  gtid, max_active_levels));
2740  KMP_DEBUG_ASSERT(__kmp_init_serial);
2741 
2742  // validate max_active_levels
2743  if (max_active_levels < 0) {
2744  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2745  // We ignore this call if the user has specified a negative value.
2746  // The current setting won't be changed. The last valid setting will be
2747  // used. A warning will be issued (if warnings are allowed as controlled by
2748  // the KMP_WARNINGS env var).
2749  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2750  "max_active_levels for thread %d = (%d)\n",
2751  gtid, max_active_levels));
2752  return;
2753  }
2754  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2755  // it's OK, max_active_levels is within the valid range:
2756  // [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2757  // We allow a zero value. (implementation-defined behavior)
2758  } else {
2759  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2760  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2761  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2762  // Current upper limit is MAX_INT. (implementation-defined behavior)
2763  // If the input exceeds the upper limit, we correct the input to be the
2764  // upper limit. (implementation-defined behavior)
2765  // Actually, control should never reach this point while the limit is MAX_INT.
2766  }
2767  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2768  "max_active_levels for thread %d = (%d)\n",
2769  gtid, max_active_levels));
2770 
2771  thread = __kmp_threads[gtid];
2772 
2773  __kmp_save_internal_controls(thread);
2774 
2775  set__max_active_levels(thread, max_active_levels);
2776 }
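/* A short usage sketch of the ICV set above (user-level view):

     omp_set_max_active_levels(2);   // allow at most 2 active (non-serialized) levels
     #pragma omp parallel            // level 1: active
     {
       #pragma omp parallel          // level 2: active (if nesting permits)
       {
         #pragma omp parallel        // level 3: serialized, runs with one thread
         { }
       }
     }
*/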
2777 
2778 /* Gets max_active_levels */
2779 int __kmp_get_max_active_levels(int gtid) {
2780  kmp_info_t *thread;
2781 
2782  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2783  KMP_DEBUG_ASSERT(__kmp_init_serial);
2784 
2785  thread = __kmp_threads[gtid];
2786  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2787  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2788  "curtask_maxaclevel=%d\n",
2789  gtid, thread->th.th_current_task,
2790  thread->th.th_current_task->td_icvs.max_active_levels));
2791  return thread->th.th_current_task->td_icvs.max_active_levels;
2792 }
2793 
2794 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2795 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2796  kmp_info_t *thread;
2797  // kmp_team_t *team;
2798 
2799  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2800  gtid, (int)kind, chunk));
2801  KMP_DEBUG_ASSERT(__kmp_init_serial);
2802 
2803  // Check if the kind parameter is valid, correct if needed.
2804  // Valid parameters should fit in one of two intervals - standard or extended:
2805  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2806  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2807  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2808  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2809  // TODO: Hint needs attention in case we change the default schedule.
2810  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2811  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2812  __kmp_msg_null);
2813  kind = kmp_sched_default;
2814  chunk = 0; // ignore chunk value in case of bad kind
2815  }
2816 
2817  thread = __kmp_threads[gtid];
2818 
2819  __kmp_save_internal_controls(thread);
2820 
2821  if (kind < kmp_sched_upper_std) {
2822  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2823  // differentiate static chunked vs. unchunked: the chunk should be invalid
2824  // to indicate an unchunked schedule (which is the default)
2825  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2826  } else {
2827  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2828  __kmp_sch_map[kind - kmp_sched_lower - 1];
2829  }
2830  } else {
2831  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2832  // kmp_sched_lower - 2 ];
2833  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2834  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2835  kmp_sched_lower - 2];
2836  }
2837  if (kind == kmp_sched_auto || chunk < 1) {
2838  // ignore parameter chunk for schedule auto
2839  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2840  } else {
2841  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2842  }
2843 }
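/* A minimal usage sketch (user-level view; do_work() is a placeholder, and the
   omp_sched_t kinds are mapped onto internal sched_type values via __kmp_sch_map
   as above):

     omp_set_schedule(omp_sched_dynamic, 4);      // kind = dynamic, chunk = 4
     #pragma omp parallel for schedule(runtime)
     for (int i = 0; i < 100; ++i)
       do_work(i);                                // iterations dealt out in chunks of 4

   Passing omp_sched_auto, or a chunk < 1, falls back to KMP_DEFAULT_CHUNK, as
   handled at the end of the routine. */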
2844 
2845 /* Gets def_sched_var ICV values */
2846 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2847  kmp_info_t *thread;
2848  enum sched_type th_type;
2849 
2850  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2851  KMP_DEBUG_ASSERT(__kmp_init_serial);
2852 
2853  thread = __kmp_threads[gtid];
2854 
2855  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2856 
2857  switch (th_type) {
2858  case kmp_sch_static:
2859  case kmp_sch_static_greedy:
2860  case kmp_sch_static_balanced:
2861  *kind = kmp_sched_static;
2862  *chunk = 0; // the chunk was not set; indicate this with a zero value
2863  return;
2864  case kmp_sch_static_chunked:
2865  *kind = kmp_sched_static;
2866  break;
2867  case kmp_sch_dynamic_chunked:
2868  *kind = kmp_sched_dynamic;
2869  break;
2870  case kmp_sch_guided_chunked:
2871  case kmp_sch_guided_iterative_chunked:
2872  case kmp_sch_guided_analytical_chunked:
2873  *kind = kmp_sched_guided;
2874  break;
2875  case kmp_sch_auto:
2876  *kind = kmp_sched_auto;
2877  break;
2878  case kmp_sch_trapezoidal:
2879  *kind = kmp_sched_trapezoidal;
2880  break;
2881 #if KMP_STATIC_STEAL_ENABLED
2882  case kmp_sch_static_steal:
2883  *kind = kmp_sched_static_steal;
2884  break;
2885 #endif
2886  default:
2887  KMP_FATAL(UnknownSchedulingType, th_type);
2888  }
2889 
2890  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2891 }
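/* A short usage sketch of the getter (user-level; mirrors the setter above):

     omp_sched_t kind;
     int chunk;
     omp_get_schedule(&kind, &chunk);  // e.g. kind == omp_sched_dynamic, chunk == 4
                                       //   after omp_set_schedule(omp_sched_dynamic, 4)

   For the unchunked static schedules the routine reports chunk == 0, as noted in
   the kmp_sch_static cases. */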
2892 
2893 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2894 
2895  int ii, dd;
2896  kmp_team_t *team;
2897  kmp_info_t *thr;
2898 
2899  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2900  KMP_DEBUG_ASSERT(__kmp_init_serial);
2901 
2902  // validate level
2903  if (level == 0)
2904  return 0;
2905  if (level < 0)
2906  return -1;
2907  thr = __kmp_threads[gtid];
2908  team = thr->th.th_team;
2909  ii = team->t.t_level;
2910  if (level > ii)
2911  return -1;
2912 
2913 #if OMP_40_ENABLED
2914  if (thr->th.th_teams_microtask) {
2915  // AC: we are in teams region where multiple nested teams have same level
2916  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2917  if (level <=
2918  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2919  KMP_DEBUG_ASSERT(ii >= tlevel);
2920  // AC: Since we need to pass through the teams league, we artificially
2921  // increase ii
2922  if (ii == tlevel) {
2923  ii += 2; // three teams have same level
2924  } else {
2925  ii++; // two teams have same level
2926  }
2927  }
2928  }
2929 #endif
2930 
2931  if (ii == level)
2932  return __kmp_tid_from_gtid(gtid);
2933 
2934  dd = team->t.t_serialized;
2935  level++;
2936  while (ii > level) {
2937  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2938  }
2939  if ((team->t.t_serialized) && (!dd)) {
2940  team = team->t.t_parent;
2941  continue;
2942  }
2943  if (ii > level) {
2944  team = team->t.t_parent;
2945  dd = team->t.t_serialized;
2946  ii--;
2947  }
2948  }
2949 
2950  return (dd > 1) ? (0) : (team->t.t_master_tid);
2951 }
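/* A short usage sketch (user-level; assumes nesting is enabled so both levels are
   active):

     #pragma omp parallel num_threads(3)
     #pragma omp parallel num_threads(2)
     {
       int outer = omp_get_ancestor_thread_num(1); // id within the outermost team
       int inner = omp_get_ancestor_thread_num(2); // == omp_get_thread_num() here
     }

   Level 0 always reports 0 (the initial thread) and an out-of-range level reports
   -1, matching the validation at the top of the routine. */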
2952 
2953 int __kmp_get_team_size(int gtid, int level) {
2954 
2955  int ii, dd;
2956  kmp_team_t *team;
2957  kmp_info_t *thr;
2958 
2959  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2960  KMP_DEBUG_ASSERT(__kmp_init_serial);
2961 
2962  // validate level
2963  if (level == 0)
2964  return 1;
2965  if (level < 0)
2966  return -1;
2967  thr = __kmp_threads[gtid];
2968  team = thr->th.th_team;
2969  ii = team->t.t_level;
2970  if (level > ii)
2971  return -1;
2972 
2973 #if OMP_40_ENABLED
2974  if (thr->th.th_teams_microtask) {
2975  // AC: we are in teams region where multiple nested teams have same level
2976  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2977  if (level <=
2978  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2979  KMP_DEBUG_ASSERT(ii >= tlevel);
2980  // AC: Since we need to pass through the teams league, we artificially
2981  // increase ii
2982  if (ii == tlevel) {
2983  ii += 2; // three teams have same level
2984  } else {
2985  ii++; // two teams have same level
2986  }
2987  }
2988  }
2989 #endif
2990 
2991  while (ii > level) {
2992  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2993  }
2994  if (team->t.t_serialized && (!dd)) {
2995  team = team->t.t_parent;
2996  continue;
2997  }
2998  if (ii > level) {
2999  team = team->t.t_parent;
3000  ii--;
3001  }
3002  }
3003 
3004  return team->t.t_nproc;
3005 }
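/* A short usage sketch (user-level counterpart of the routine above; assumes
   nesting is enabled so the inner region is active):

     #pragma omp parallel num_threads(3)
     #pragma omp parallel num_threads(2)
     {
       int outer = omp_get_team_size(1);  // 3: size of the outermost team
       int inner = omp_get_team_size(2);  // 2: size of the enclosing team
       int init  = omp_get_team_size(0);  // 1: the initial "team" of one thread
     }

   As with the ancestor query, an out-of-range level yields -1. */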
3006 
3007 kmp_r_sched_t __kmp_get_schedule_global() {
3008  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3009  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3010  // independently, so the updated schedule can be obtained here.
3011 
3012  kmp_r_sched_t r_sched;
3013 
3014  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3015  // __kmp_guided. __kmp_sched should keep its original value, so that the user
3016  // can set KMP_SCHEDULE multiple times and thus have different run-time
3017  // schedules in different roots (even in OMP 2.5)
3018  if (__kmp_sched == kmp_sch_static) {
3019  // replace STATIC with more detailed schedule (balanced or greedy)
3020  r_sched.r_sched_type = __kmp_static;
3021  } else if (__kmp_sched == kmp_sch_guided_chunked) {
3022  // replace GUIDED with more detailed schedule (iterative or analytical)
3023  r_sched.r_sched_type = __kmp_guided;
3024  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3025  r_sched.r_sched_type = __kmp_sched;
3026  }
3027 
3028  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3029  // __kmp_chunk may be wrong here (if it was never set)
3030  r_sched.chunk = KMP_DEFAULT_CHUNK;
3031  } else {
3032  r_sched.chunk = __kmp_chunk;
3033  }
3034 
3035  return r_sched;
3036 }
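/* A brief worked example of how the globals combine here (assumes the usual
   OMP_SCHEDULE parsing in kmp_settings.cpp):

     // OMP_SCHEDULE="guided,7"  =>  __kmp_sched = guided kind, __kmp_chunk = 7
     kmp_r_sched_t rs = __kmp_get_schedule_global();
     // rs.r_sched_type == __kmp_guided (the detailed guided variant)
     // rs.chunk        == 7 (or KMP_DEFAULT_CHUNK if no chunk was given)
*/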
3037 
3038 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3039  at least argc *t_argv entries for the requested team. */
3040 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3041 
3042  KMP_DEBUG_ASSERT(team);
3043  if (!realloc || argc > team->t.t_max_argc) {
3044 
3045  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3046  "current entries=%d\n",
3047  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3048  /* if heap space was previously allocated for the args, free it */
3049  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3050  __kmp_free((void *)team->t.t_argv);
3051 
3052  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3053  /* use unused space in the cache line for arguments */
3054  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3055  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3056  "argv entries\n",
3057  team->t.t_id, team->t.t_max_argc));
3058  team->t.t_argv = &team->t.t_inline_argv[0];
3059  if (__kmp_storage_map) {
3060  __kmp_print_storage_map_gtid(
3061  -1, &team->t.t_inline_argv[0],
3062  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3063  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3064  team->t.t_id);
3065  }
3066  } else {
3067  /* allocate space for arguments in the heap */
3068  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3069  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3070  : 2 * argc;
3071  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3072  "argv entries\n",
3073  team->t.t_id, team->t.t_max_argc));
3074  team->t.t_argv =
3075  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3076  if (__kmp_storage_map) {
3077  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3078  &team->t.t_argv[team->t.t_max_argc],
3079  sizeof(void *) * team->t.t_max_argc,
3080  "team_%d.t_argv", team->t.t_id);
3081  }
3082  }
3083  }
3084 }
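/* A worked summary of the sizing policy above (follows directly from the code;
   KMP_INLINE_ARGV_ENTRIES and KMP_MIN_MALLOC_ARGV_ENTRIES are the build-time
   constants referenced there):

     // argc <= KMP_INLINE_ARGV_ENTRIES          -> t_argv = t_inline_argv,
     //                                              t_max_argc = KMP_INLINE_ARGV_ENTRIES
     // argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2  -> heap block of
     //                                              KMP_MIN_MALLOC_ARGV_ENTRIES entries
     // larger argc                              -> heap block of 2 * argc entries
*/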
3085 
3086 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3087  int i;
3088  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3089  team->t.t_threads =
3090  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3091  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3092  sizeof(dispatch_shared_info_t) * num_disp_buff);
3093  team->t.t_dispatch =
3094  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3095  team->t.t_implicit_task_taskdata =
3096  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3097  team->t.t_max_nproc = max_nth;
3098 
3099  /* setup dispatch buffers */
3100  for (i = 0; i < num_disp_buff; ++i) {
3101  team->t.t_disp_buffer[i].buffer_index = i;
3102 #if OMP_45_ENABLED
3103  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3104 #endif
3105  }
3106 }
3107 
3108 static void __kmp_free_team_arrays(kmp_team_t *team) {
3109  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3110  int i;
3111  for (i = 0; i < team->t.t_max_nproc; ++i) {
3112  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3113  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3114  team->t.t_dispatch[i].th_disp_buffer = NULL;
3115  }
3116  }
3117 #if KMP_USE_HIER_SCHED
3118  __kmp_dispatch_free_hierarchies(team);
3119 #endif
3120  __kmp_free(team->t.t_threads);
3121  __kmp_free(team->t.t_disp_buffer);
3122  __kmp_free(team->t.t_dispatch);
3123  __kmp_free(team->t.t_implicit_task_taskdata);
3124  team->t.t_threads = NULL;
3125  team->t.t_disp_buffer = NULL;
3126  team->t.t_dispatch = NULL;
3127  team->t.t_implicit_task_taskdata = NULL;
3128 }
3129 
3130 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3131  kmp_info_t **oldThreads = team->t.t_threads;
3132 
3133  __kmp_free(team->t.t_disp_buffer);
3134  __kmp_free(team->t.t_dispatch);
3135  __kmp_free(team->t.t_implicit_task_taskdata);
3136  __kmp_allocate_team_arrays(team, max_nth);
3137 
3138  KMP_MEMCPY(team->t.t_threads, oldThreads,
3139  team->t.t_nproc * sizeof(kmp_info_t *));
3140 
3141  __kmp_free(oldThreads);
3142 }
3143 
3144 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3145 
3146  kmp_r_sched_t r_sched =
3147  __kmp_get_schedule_global(); // get current state of scheduling globals
3148 
3149 #if OMP_40_ENABLED
3150  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3151 #endif /* OMP_40_ENABLED */
3152 
3153  kmp_internal_control_t g_icvs = {
3154  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3155  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3156  // for nested parallelism (per thread)
3157  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3158  // adjustment of threads (per thread)
3159  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3160  // whether blocktime is explicitly set
3161  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3162 #if KMP_USE_MONITOR
3163  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3164 // intervals
3165 #endif
3166  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3167  // next parallel region (per thread)
3168  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3169  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3170  // for max_active_levels
3171  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3172 // {sched,chunk} pair
3173 #if OMP_40_ENABLED
3174  __kmp_nested_proc_bind.bind_types[0],
3175  __kmp_default_device,
3176 #endif /* OMP_40_ENABLED */
3177  NULL // struct kmp_internal_control *next;
3178  };
3179 
3180  return g_icvs;
3181 }
3182 
3183 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3184 
3185  kmp_internal_control_t gx_icvs;
3186  gx_icvs.serial_nesting_level =
3187  0; // probably =team->t.t_serial like in save_inter_controls
3188  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3189  gx_icvs.next = NULL;
3190 
3191  return gx_icvs;
3192 }
3193 
3194 static void __kmp_initialize_root(kmp_root_t *root) {
3195  int f;
3196  kmp_team_t *root_team;
3197  kmp_team_t *hot_team;
3198  int hot_team_max_nth;
3199  kmp_r_sched_t r_sched =
3200  __kmp_get_schedule_global(); // get current state of scheduling globals
3201  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3202  KMP_DEBUG_ASSERT(root);
3203  KMP_ASSERT(!root->r.r_begin);
3204 
3205  /* setup the root state structure */
3206  __kmp_init_lock(&root->r.r_begin_lock);
3207  root->r.r_begin = FALSE;
3208  root->r.r_active = FALSE;
3209  root->r.r_in_parallel = 0;
3210  root->r.r_blocktime = __kmp_dflt_blocktime;
3211  root->r.r_nested = __kmp_dflt_nested;
3212  root->r.r_cg_nthreads = 1;
3213 
3214  /* setup the root team for this task */
3215  /* allocate the root team structure */
3216  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3217 
3218  root_team =
3219  __kmp_allocate_team(root,
3220  1, // new_nproc
3221  1, // max_nproc
3222 #if OMPT_SUPPORT
3223  ompt_data_none, // root parallel id
3224 #endif
3225 #if OMP_40_ENABLED
3226  __kmp_nested_proc_bind.bind_types[0],
3227 #endif
3228  &r_icvs,
3229  0 // argc
3230  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3231  );
3232 #if USE_DEBUGGER
3233  // Non-NULL value should be assigned to make the debugger display the root
3234  // team.
3235  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3236 #endif
3237 
3238  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3239 
3240  root->r.r_root_team = root_team;
3241  root_team->t.t_control_stack_top = NULL;
3242 
3243  /* initialize root team */
3244  root_team->t.t_threads[0] = NULL;
3245  root_team->t.t_nproc = 1;
3246  root_team->t.t_serialized = 1;
3247  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3248  root_team->t.t_sched.sched = r_sched.sched;
3249  KA_TRACE(
3250  20,
3251  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3252  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3253 
3254  /* setup the hot team for this task */
3255  /* allocate the hot team structure */
3256  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3257 
3258  hot_team =
3259  __kmp_allocate_team(root,
3260  1, // new_nproc
3261  __kmp_dflt_team_nth_ub * 2, // max_nproc
3262 #if OMPT_SUPPORT
3263  ompt_data_none, // root parallel id
3264 #endif
3265 #if OMP_40_ENABLED
3266  __kmp_nested_proc_bind.bind_types[0],
3267 #endif
3268  &r_icvs,
3269  0 // argc
3270  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3271  );
3272  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3273 
3274  root->r.r_hot_team = hot_team;
3275  root_team->t.t_control_stack_top = NULL;
3276 
3277  /* first-time initialization */
3278  hot_team->t.t_parent = root_team;
3279 
3280  /* initialize hot team */
3281  hot_team_max_nth = hot_team->t.t_max_nproc;
3282  for (f = 0; f < hot_team_max_nth; ++f) {
3283  hot_team->t.t_threads[f] = NULL;
3284  }
3285  hot_team->t.t_nproc = 1;
3286  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3287  hot_team->t.t_sched.sched = r_sched.sched;
3288  hot_team->t.t_size_changed = 0;
3289 }
3290 
3291 #ifdef KMP_DEBUG
3292 
3293 typedef struct kmp_team_list_item {
3294  kmp_team_p const *entry;
3295  struct kmp_team_list_item *next;
3296 } kmp_team_list_item_t;
3297 typedef kmp_team_list_item_t *kmp_team_list_t;
3298 
3299 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3300  kmp_team_list_t list, // List of teams.
3301  kmp_team_p const *team // Team to add.
3302  ) {
3303 
3304  // List must terminate with item where both entry and next are NULL.
3305  // Team is added to the list only once.
3306  // List is sorted in ascending order by team id.
3307  // Team id is *not* a key.
3308 
3309  kmp_team_list_t l;
3310 
3311  KMP_DEBUG_ASSERT(list != NULL);
3312  if (team == NULL) {
3313  return;
3314  }
3315 
3316  __kmp_print_structure_team_accum(list, team->t.t_parent);
3317  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3318 
3319  // Search list for the team.
3320  l = list;
3321  while (l->next != NULL && l->entry != team) {
3322  l = l->next;
3323  }
3324  if (l->next != NULL) {
3325  return; // Team has been added before, exit.
3326  }
3327 
3328  // Team is not found. Search list again for insertion point.
3329  l = list;
3330  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3331  l = l->next;
3332  }
3333 
3334  // Insert team.
3335  {
3336  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3337  sizeof(kmp_team_list_item_t));
3338  *item = *l;
3339  l->entry = team;
3340  l->next = item;
3341  }
3342 }
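
/* Illustrative sketch (not part of the runtime): the list above ends in a
   sentinel node whose entry and next are both NULL, so insertion never needs a
   "previous" pointer -- the node at the insertion point is copied into fresh
   storage and then overwritten in place with the new team. The hypothetical
   helper below restates just that splice (without the duplicate check) and is
   guarded out of the build. */
#if 0
static void __kmp_example_sorted_insert(kmp_team_list_t list, // hypothetical
                                        kmp_team_p const *team) {
  kmp_team_list_t l = list;
  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id)
    l = l->next; // stop at the first larger id, or at the sentinel
  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
      sizeof(kmp_team_list_item_t));
  *item = *l;      // old contents (possibly the sentinel) move one node down
  l->entry = team; // the node at the insertion point now holds the new team
  l->next = item;
}
#endif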
3343 
3344 static void __kmp_print_structure_team(char const *title,
3345  kmp_team_p const *team) {
3347  __kmp_printf("%s", title);
3348  if (team != NULL) {
3349  __kmp_printf("%2x %p\n", team->t.t_id, team);
3350  } else {
3351  __kmp_printf(" - (nil)\n");
3352  }
3353 }
3354 
3355 static void __kmp_print_structure_thread(char const *title,
3356  kmp_info_p const *thread) {
3357  __kmp_printf("%s", title);
3358  if (thread != NULL) {
3359  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3360  } else {
3361  __kmp_printf(" - (nil)\n");
3362  }
3363 }
3364 
3365 void __kmp_print_structure(void) {
3366 
3367  kmp_team_list_t list;
3368 
3369  // Initialize list of teams.
3370  list =
3371  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3372  list->entry = NULL;
3373  list->next = NULL;
3374 
3375  __kmp_printf("\n------------------------------\nGlobal Thread "
3376  "Table\n------------------------------\n");
3377  {
3378  int gtid;
3379  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3380  __kmp_printf("%2d", gtid);
3381  if (__kmp_threads != NULL) {
3382  __kmp_printf(" %p", __kmp_threads[gtid]);
3383  }
3384  if (__kmp_root != NULL) {
3385  __kmp_printf(" %p", __kmp_root[gtid]);
3386  }
3387  __kmp_printf("\n");
3388  }
3389  }
3390 
3391  // Print out __kmp_threads array.
3392  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3393  "----------\n");
3394  if (__kmp_threads != NULL) {
3395  int gtid;
3396  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3397  kmp_info_t const *thread = __kmp_threads[gtid];
3398  if (thread != NULL) {
3399  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3400  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3401  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3402  __kmp_print_structure_team(" Serial Team: ",
3403  thread->th.th_serial_team);
3404  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3405  __kmp_print_structure_thread(" Master: ",
3406  thread->th.th_team_master);
3407  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3408  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3409 #if OMP_40_ENABLED
3410  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3411 #endif
3412  __kmp_print_structure_thread(" Next in pool: ",
3413  thread->th.th_next_pool);
3414  __kmp_printf("\n");
3415  __kmp_print_structure_team_accum(list, thread->th.th_team);
3416  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3417  }
3418  }
3419  } else {
3420  __kmp_printf("Threads array is not allocated.\n");
3421  }
3422 
3423  // Print out __kmp_root array.
3424  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3425  "--------\n");
3426  if (__kmp_root != NULL) {
3427  int gtid;
3428  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3429  kmp_root_t const *root = __kmp_root[gtid];
3430  if (root != NULL) {
3431  __kmp_printf("GTID %2d %p:\n", gtid, root);
3432  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3433  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3434  __kmp_print_structure_thread(" Uber Thread: ",
3435  root->r.r_uber_thread);
3436  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3437  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3438  __kmp_printf(" In Parallel: %2d\n",
3439  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3440  __kmp_printf("\n");
3441  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3442  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3443  }
3444  }
3445  } else {
3446  __kmp_printf("Ubers array is not allocated.\n");
3447  }
3448 
3449  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3450  "--------\n");
3451  while (list->next != NULL) {
3452  kmp_team_p const *team = list->entry;
3453  int i;
3454  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3455  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3456  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3457  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3458  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3459  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3460  for (i = 0; i < team->t.t_nproc; ++i) {
3461  __kmp_printf(" Thread %2d: ", i);
3462  __kmp_print_structure_thread("", team->t.t_threads[i]);
3463  }
3464  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3465  __kmp_printf("\n");
3466  list = list->next;
3467  }
3468 
3469  // Print out __kmp_thread_pool and __kmp_team_pool.
3470  __kmp_printf("\n------------------------------\nPools\n----------------------"
3471  "--------\n");
3472  __kmp_print_structure_thread("Thread pool: ",
3473  CCAST(kmp_info_t *, __kmp_thread_pool));
3474  __kmp_print_structure_team("Team pool: ",
3475  CCAST(kmp_team_t *, __kmp_team_pool));
3476  __kmp_printf("\n");
3477 
3478  // Free team list.
3479  while (list != NULL) {
3480  kmp_team_list_item_t *item = list;
3481  list = list->next;
3482  KMP_INTERNAL_FREE(item);
3483  }
3484 }
3485 
3486 #endif
3487 
3488 //---------------------------------------------------------------------------
3489 // Stuff for per-thread fast random number generator
3490 // Table of primes
3491 static const unsigned __kmp_primes[] = {
3492  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3493  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3494  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3495  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3496  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3497  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3498  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3499  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3500  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3501  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3502  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3503 
3504 //---------------------------------------------------------------------------
3505 // __kmp_get_random: Get a random number using a linear congruential method.
3506 unsigned short __kmp_get_random(kmp_info_t *thread) {
3507  unsigned x = thread->th.th_x;
3508  unsigned short r = x >> 16;
3509 
3510  thread->th.th_x = x * thread->th.th_a + 1;
3511 
3512  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3513  thread->th.th_info.ds.ds_tid, r));
3514 
3515  return r;
3516 }
3517 //--------------------------------------------------------
3518 // __kmp_init_random: Initialize a random number generator
3519 void __kmp_init_random(kmp_info_t *thread) {
3520  unsigned seed = thread->th.th_info.ds.ds_tid;
3521 
3522  thread->th.th_a =
3523  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3524  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3525  KA_TRACE(30,
3526  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3527 }
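
/* Illustrative sketch (not part of the runtime): the per-thread generator above
   is a plain linear congruential generator, x' = a * x + 1 (mod 2^32 via
   unsigned wrap-around), where the multiplier a is picked from __kmp_primes by
   thread id and only the high 16 bits of the state are returned as the result.
   The hypothetical, self-contained restatement below is guarded out of the
   build. */
#if 0
typedef struct kmp_example_lcg { // hypothetical, illustration only
  unsigned a; // multiplier, chosen from __kmp_primes by seed
  unsigned x; // current state
} kmp_example_lcg_t;

static void __kmp_example_lcg_init(kmp_example_lcg_t *g, unsigned seed) {
  g->a = __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
  g->x = (seed + 1) * g->a + 1;
}

static unsigned short __kmp_example_lcg_next(kmp_example_lcg_t *g) {
  unsigned short r = (unsigned short)(g->x >> 16); // high bits are better mixed
  g->x = g->x * g->a + 1;                          // advance the state
  return r;
}
#endif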
3528 
3529 #if KMP_OS_WINDOWS
3530 /* reclaim array entries for root threads that are already dead, returns number
3531  * reclaimed */
3532 static int __kmp_reclaim_dead_roots(void) {
3533  int i, r = 0;
3534 
3535  for (i = 0; i < __kmp_threads_capacity; ++i) {
3536  if (KMP_UBER_GTID(i) &&
3537  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3538  !__kmp_root[i]
3539  ->r.r_active) { // AC: reclaim only roots died in non-active state
3540  r += __kmp_unregister_root_other_thread(i);
3541  }
3542  }
3543  return r;
3544 }
3545 #endif
3546 
3547 /* This function attempts to create free entries in __kmp_threads and
3548  __kmp_root, and returns the number of free entries generated.
3549 
3550  For Windows* OS static library, the first mechanism used is to reclaim array
3551  entries for root threads that are already dead.
3552 
3553  On all platforms, expansion is attempted on the arrays __kmp_threads and
3554  __kmp_root, with an appropriate update to __kmp_threads_capacity. Array
3555  capacity is increased by doubling, with clipping to __kmp_sys_max_nth; if the
3556  threadprivate cache array has been created, it is resized to match.
3557  Synchronization with __kmpc_threadprivate_cached is done using
3558  __kmp_tp_cached_lock.
3558 
3559  After any dead root reclamation, if the clipping value allows array expansion
3560  to result in the generation of a total of nNeed free slots, the function does
3561  that expansion. If not, nothing is done beyond the possible initial root
3562  thread reclamation.
3563 
3564  If any argument is negative, the behavior is undefined. */
3565 static int __kmp_expand_threads(int nNeed) {
3566  int added = 0;
3567  int minimumRequiredCapacity;
3568  int newCapacity;
3569  kmp_info_t **newThreads;
3570  kmp_root_t **newRoot;
3571 
3572 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3573 // resizing __kmp_threads does not need additional protection if foreign
3574 // threads are present
3575 
3576 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3577  /* only for Windows static library */
3578  /* reclaim array entries for root threads that are already dead */
3579  added = __kmp_reclaim_dead_roots();
3580 
3581  if (nNeed) {
3582  nNeed -= added;
3583  if (nNeed < 0)
3584  nNeed = 0;
3585  }
3586 #endif
3587  if (nNeed <= 0)
3588  return added;
3589 
3590  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3591  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3592  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3593  // > __kmp_max_nth in one of two ways:
3594  //
3595  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3596  // may not be reused by another thread, so we may need to increase
3597  // __kmp_threads_capacity to __kmp_max_nth + 1.
3598  //
3599  // 2) New foreign root(s) are encountered. We always register new foreign
3600  // roots. This may cause a smaller # of threads to be allocated at
3601  // subsequent parallel regions, but the worker threads hang around (and
3602  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3603  //
3604  // Anyway, that is the reason for moving the check to see if
3605  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3606  // instead of having it performed here. -BB
3607 
3608  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3609 
3610  /* compute expansion headroom to check if we can expand */
3611  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3612  /* possible expansion too small -- give up */
3613  return added;
3614  }
3615  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3616 
3617  newCapacity = __kmp_threads_capacity;
3618  do {
3619  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3620  : __kmp_sys_max_nth;
3621  } while (newCapacity < minimumRequiredCapacity);
3622  newThreads = (kmp_info_t **)__kmp_allocate(
3623  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3624  newRoot =
3625  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3626  KMP_MEMCPY(newThreads, __kmp_threads,
3627  __kmp_threads_capacity * sizeof(kmp_info_t *));
3628  KMP_MEMCPY(newRoot, __kmp_root,
3629  __kmp_threads_capacity * sizeof(kmp_root_t *));
3630 
3631  kmp_info_t **temp_threads = __kmp_threads;
3632  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3633  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3634  __kmp_free(temp_threads);
3635  added += newCapacity - __kmp_threads_capacity;
3636  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3637 
3638  if (newCapacity > __kmp_tp_capacity) {
3639  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3640  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3641  __kmp_threadprivate_resize_cache(newCapacity);
3642  } else { // increase __kmp_tp_capacity to correspond with the __kmp_threads size
3643  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3644  }
3645  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3646  }
3647 
3648  return added;
3649 }
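
/* Illustrative sketch (not part of the runtime): the growth rule used by
   __kmp_expand_threads above. Capacity doubles from its current value until it
   covers the requested number of additional slots, and the doubling step clips
   at __kmp_sys_max_nth; the caller has already verified that the request fits
   under that limit, so the loop terminates. The hypothetical helper below is
   guarded out of the build. */
#if 0
static int __kmp_example_new_capacity(int current, int nNeed, int sys_max) {
  // hypothetical, illustration only; assumes current + nNeed <= sys_max
  int required = current + nNeed;
  int cap = current;
  do {
    cap = cap <= (sys_max >> 1) ? (cap << 1) : sys_max;
  } while (cap < required);
  return cap;
}
#endif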
3650 
3651 /* Register the current thread as a root thread and obtain our gtid. We must
3652  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3653  the thread that calls from __kmp_do_serial_initialize() */
3654 int __kmp_register_root(int initial_thread) {
3655  kmp_info_t *root_thread;
3656  kmp_root_t *root;
3657  int gtid;
3658  int capacity;
3659  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3660  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3661  KMP_MB();
3662 
3663  /* 2007-03-02:
3664  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3665  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3666  condition does not work as expected -- it may return false (meaning there
3667  is at least one empty slot in the __kmp_threads array), but it is possible
3668  that the only free slot is #0, which is reserved for the initial thread and
3669  so cannot be used for this one. The following code works around this bug.
3670 
3671  However, the right solution seems to be not to reserve slot #0 for the
3672  initial thread, because:
3673  (1) there is no magic in slot #0,
3674  (2) we cannot detect the initial thread reliably (the first thread that
3675  does serial initialization may not be a real initial thread).
3676  */
3677  capacity = __kmp_threads_capacity;
3678  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3679  --capacity;
3680  }
3681 
3682  /* see if there are too many threads */
3683  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3684  if (__kmp_tp_cached) {
3685  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3686  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3687  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3688  } else {
3689  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3690  __kmp_msg_null);
3691  }
3692  }
3693 
3694  /* find an available thread slot */
3695  /* Don't reassign the zero slot since we need that to only be used by initial
3696  thread */
3697  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3698  gtid++)
3699  ;
3700  KA_TRACE(1,
3701  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3702  KMP_ASSERT(gtid < __kmp_threads_capacity);
3703 
3704  /* update global accounting */
3705  __kmp_all_nth++;
3706  TCW_4(__kmp_nth, __kmp_nth + 1);
3707 
3708  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3709  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3710  if (__kmp_adjust_gtid_mode) {
3711  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3712  if (TCR_4(__kmp_gtid_mode) != 2) {
3713  TCW_4(__kmp_gtid_mode, 2);
3714  }
3715  } else {
3716  if (TCR_4(__kmp_gtid_mode) != 1) {
3717  TCW_4(__kmp_gtid_mode, 1);
3718  }
3719  }
3720  }
3721 
3722 #ifdef KMP_ADJUST_BLOCKTIME
3723  /* Adjust blocktime to zero if necessary */
3724  /* Middle initialization might not have occurred yet */
3725  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3726  if (__kmp_nth > __kmp_avail_proc) {
3727  __kmp_zero_bt = TRUE;
3728  }
3729  }
3730 #endif /* KMP_ADJUST_BLOCKTIME */
3731 
3732  /* setup this new hierarchy */
3733  if (!(root = __kmp_root[gtid])) {
3734  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3735  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3736  }
3737 
3738 #if KMP_STATS_ENABLED
3739  // Initialize stats as soon as possible (right after gtid assignment).
3740  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3741  __kmp_stats_thread_ptr->startLife();
3742  KMP_SET_THREAD_STATE(SERIAL_REGION);
3743  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3744 #endif
3745  __kmp_initialize_root(root);
3746 
3747  /* setup new root thread structure */
3748  if (root->r.r_uber_thread) {
3749  root_thread = root->r.r_uber_thread;
3750  } else {
3751  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3752  if (__kmp_storage_map) {
3753  __kmp_print_thread_storage_map(root_thread, gtid);
3754  }
3755  root_thread->th.th_info.ds.ds_gtid = gtid;
3756 #if OMPT_SUPPORT
3757  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3758 #endif
3759  root_thread->th.th_root = root;
3760  if (__kmp_env_consistency_check) {
3761  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3762  }
3763 #if USE_FAST_MEMORY
3764  __kmp_initialize_fast_memory(root_thread);
3765 #endif /* USE_FAST_MEMORY */
3766 
3767 #if KMP_USE_BGET
3768  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3769  __kmp_initialize_bget(root_thread);
3770 #endif
3771  __kmp_init_random(root_thread); // Initialize random number generator
3772  }
3773 
3774  /* setup the serial team held in reserve by the root thread */
3775  if (!root_thread->th.th_serial_team) {
3776  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3777  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3778  root_thread->th.th_serial_team =
3779  __kmp_allocate_team(root, 1, 1,
3780 #if OMPT_SUPPORT
3781  ompt_data_none, // root parallel id
3782 #endif
3783 #if OMP_40_ENABLED
3784  proc_bind_default,
3785 #endif
3786  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3787  }
3788  KMP_ASSERT(root_thread->th.th_serial_team);
3789  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3790  root_thread->th.th_serial_team));
3791 
3792  /* drop root_thread into place */
3793  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3794 
3795  root->r.r_root_team->t.t_threads[0] = root_thread;
3796  root->r.r_hot_team->t.t_threads[0] = root_thread;
3797  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3798  // AC: the team is created in reserve, not for execution (it is unused for now).
3799  root_thread->th.th_serial_team->t.t_serialized = 0;
3800  root->r.r_uber_thread = root_thread;
3801 
3802  /* initialize the thread, get it ready to go */
3803  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3804  TCW_4(__kmp_init_gtid, TRUE);
3805 
3806  /* prepare the master thread for get_gtid() */
3807  __kmp_gtid_set_specific(gtid);
3808 
3809 #if USE_ITT_BUILD
3810  __kmp_itt_thread_name(gtid);
3811 #endif /* USE_ITT_BUILD */
3812 
3813 #ifdef KMP_TDATA_GTID
3814  __kmp_gtid = gtid;
3815 #endif
3816  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3817  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3818 
3819  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3820  "plain=%u\n",
3821  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3822  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3823  KMP_INIT_BARRIER_STATE));
3824  { // Initialize barrier data.
3825  int b;
3826  for (b = 0; b < bs_last_barrier; ++b) {
3827  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3828 #if USE_DEBUGGER
3829  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3830 #endif
3831  }
3832  }
3833  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3834  KMP_INIT_BARRIER_STATE);
3835 
3836 #if KMP_AFFINITY_SUPPORTED
3837 #if OMP_40_ENABLED
3838  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3839  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3840  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3841  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3842 #endif
3843  if (TCR_4(__kmp_init_middle)) {
3844  __kmp_affinity_set_init_mask(gtid, TRUE);
3845  }
3846 #endif /* KMP_AFFINITY_SUPPORTED */
3847 #if OMP_50_ENABLED
3848  root_thread->th.th_def_allocator = __kmp_def_allocator;
3849  root_thread->th.th_prev_level = 0;
3850  root_thread->th.th_prev_num_threads = 1;
3851 #endif
3852 
3853  __kmp_root_counter++;
3854 
3855 #if OMPT_SUPPORT
3856  if (!initial_thread && ompt_enabled.enabled) {
3857 
3858  kmp_info_t *root_thread = ompt_get_thread();
3859 
3860  ompt_set_thread_state(root_thread, ompt_state_overhead);
3861 
3862  if (ompt_enabled.ompt_callback_thread_begin) {
3863  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3864  ompt_thread_initial, __ompt_get_thread_data_internal());
3865  }
3866  ompt_data_t *task_data;
3867  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3868  if (ompt_enabled.ompt_callback_task_create) {
3869  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3870  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3871  // initial task has nothing to return to
3872  }
3873 
3874  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3875  }
3876 #endif
3877 
3878  KMP_MB();
3879  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3880 
3881  return gtid;
3882 }
3883 
3884 #if KMP_NESTED_HOT_TEAMS
3885 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3886  const int max_level) {
3887  int i, n, nth;
3888  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3889  if (!hot_teams || !hot_teams[level].hot_team) {
3890  return 0;
3891  }
3892  KMP_DEBUG_ASSERT(level < max_level);
3893  kmp_team_t *team = hot_teams[level].hot_team;
3894  nth = hot_teams[level].hot_team_nth;
3895  n = nth - 1; // master is not freed
3896  if (level < max_level - 1) {
3897  for (i = 0; i < nth; ++i) {
3898  kmp_info_t *th = team->t.t_threads[i];
3899  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3900  if (i > 0 && th->th.th_hot_teams) {
3901  __kmp_free(th->th.th_hot_teams);
3902  th->th.th_hot_teams = NULL;
3903  }
3904  }
3905  }
3906  __kmp_free_team(root, team, NULL);
3907  return n;
3908 }
3909 #endif
3910 
3911 // Resets a root thread and clears its root and hot teams.
3912 // Returns the number of __kmp_threads entries directly and indirectly freed.
3913 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3914  kmp_team_t *root_team = root->r.r_root_team;
3915  kmp_team_t *hot_team = root->r.r_hot_team;
3916  int n = hot_team->t.t_nproc;
3917  int i;
3918 
3919  KMP_DEBUG_ASSERT(!root->r.r_active);
3920 
3921  root->r.r_root_team = NULL;
3922  root->r.r_hot_team = NULL;
3923  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3924  // before call to __kmp_free_team().
3925  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3926 #if KMP_NESTED_HOT_TEAMS
3927  if (__kmp_hot_teams_max_level >
3928  0) { // need to free nested hot teams and their threads if any
3929  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3930  kmp_info_t *th = hot_team->t.t_threads[i];
3931  if (__kmp_hot_teams_max_level > 1) {
3932  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3933  }
3934  if (th->th.th_hot_teams) {
3935  __kmp_free(th->th.th_hot_teams);
3936  th->th.th_hot_teams = NULL;
3937  }
3938  }
3939  }
3940 #endif
3941  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3942 
3943  // Before we can reap the thread, we need to make certain that all other
3944  // threads in the teams that had this root as ancestor have stopped trying to
3945  // steal tasks.
3946  if (__kmp_tasking_mode != tskm_immediate_exec) {
3947  __kmp_wait_to_unref_task_teams();
3948  }
3949 
3950 #if KMP_OS_WINDOWS
3951  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3952  KA_TRACE(
3953  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3954  "\n",
3955  (LPVOID) & (root->r.r_uber_thread->th),
3956  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3957  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3958 #endif /* KMP_OS_WINDOWS */
3959 
3960 #if OMPT_SUPPORT
3961  if (ompt_enabled.ompt_callback_thread_end) {
3962  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3963  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3964  }
3965 #endif
3966 
3967  TCW_4(__kmp_nth,
3968  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3969  root->r.r_cg_nthreads--;
3970 
3971  __kmp_reap_thread(root->r.r_uber_thread, 1);
3972 
3973  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it
3974  // instead of freeing it.
3975  root->r.r_uber_thread = NULL;
3976  /* mark root as no longer in use */
3977  root->r.r_begin = FALSE;
3978 
3979  return n;
3980 }
3981 
3982 void __kmp_unregister_root_current_thread(int gtid) {
3983  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3984  /* this lock should be ok, since unregister_root_current_thread is never
3985  called during an abort, only during a normal close. furthermore, if you
3986  have the forkjoin lock, you should never try to get the initz lock */
3987  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3988  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3989  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3990  "exiting T#%d\n",
3991  gtid));
3992  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3993  return;
3994  }
3995  kmp_root_t *root = __kmp_root[gtid];
3996 
3997  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3998  KMP_ASSERT(KMP_UBER_GTID(gtid));
3999  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4000  KMP_ASSERT(root->r.r_active == FALSE);
4001 
4002  KMP_MB();
4003 
4004 #if OMP_45_ENABLED
4005  kmp_info_t *thread = __kmp_threads[gtid];
4006  kmp_team_t *team = thread->th.th_team;
4007  kmp_task_team_t *task_team = thread->th.th_task_team;
4008 
4009  // we need to wait for the proxy tasks before finishing the thread
4010  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4011 #if OMPT_SUPPORT
4012  // the runtime is shutting down so we won't report any events
4013  thread->th.ompt_thread_info.state = ompt_state_undefined;
4014 #endif
4015  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4016  }
4017 #endif
4018 
4019  __kmp_reset_root(gtid, root);
4020 
4021  /* free up this thread slot */
4022  __kmp_gtid_set_specific(KMP_GTID_DNE);
4023 #ifdef KMP_TDATA_GTID
4024  __kmp_gtid = KMP_GTID_DNE;
4025 #endif
4026 
4027  KMP_MB();
4028  KC_TRACE(10,
4029  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4030 
4031  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4032 }
4033 
4034 #if KMP_OS_WINDOWS
4035 /* __kmp_forkjoin_lock must be already held
4036  Unregisters a root thread that is not the current thread. Returns the number
4037  of __kmp_threads entries freed as a result. */
4038 static int __kmp_unregister_root_other_thread(int gtid) {
4039  kmp_root_t *root = __kmp_root[gtid];
4040  int r;
4041 
4042  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4043  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4044  KMP_ASSERT(KMP_UBER_GTID(gtid));
4045  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4046  KMP_ASSERT(root->r.r_active == FALSE);
4047 
4048  r = __kmp_reset_root(gtid, root);
4049  KC_TRACE(10,
4050  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4051  return r;
4052 }
4053 #endif
4054 
4055 #if KMP_DEBUG
4056 void __kmp_task_info() {
4057 
4058  kmp_int32 gtid = __kmp_entry_gtid();
4059  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4060  kmp_info_t *this_thr = __kmp_threads[gtid];
4061  kmp_team_t *steam = this_thr->th.th_serial_team;
4062  kmp_team_t *team = this_thr->th.th_team;
4063 
4064  __kmp_printf(
4065  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4066  "ptask=%p\n",
4067  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4068  team->t.t_implicit_task_taskdata[tid].td_parent);
4069 }
4070 #endif // KMP_DEBUG
4071 
4072 /* TODO optimize with one big memclr, take out what isn't needed, split
4073  responsibility to workers as much as possible, and delay initialization of
4074  features as much as possible */
4075 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4076  int tid, int gtid) {
4077  /* this_thr->th.th_info.ds.ds_gtid is setup in
4078  kmp_allocate_thread/create_worker.
4079  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4080  kmp_info_t *master = team->t.t_threads[0];
4081  KMP_DEBUG_ASSERT(this_thr != NULL);
4082  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4083  KMP_DEBUG_ASSERT(team);
4084  KMP_DEBUG_ASSERT(team->t.t_threads);
4085  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4086  KMP_DEBUG_ASSERT(master);
4087  KMP_DEBUG_ASSERT(master->th.th_root);
4088 
4089  KMP_MB();
4090 
4091  TCW_SYNC_PTR(this_thr->th.th_team, team);
4092 
4093  this_thr->th.th_info.ds.ds_tid = tid;
4094  this_thr->th.th_set_nproc = 0;
4095  if (__kmp_tasking_mode != tskm_immediate_exec)
4096  // When tasking is possible, threads are not safe to reap until they are
4097  // done tasking; this will be set when tasking code is exited in wait
4098  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4099  else // no tasking --> always safe to reap
4100  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4101 #if OMP_40_ENABLED
4102  this_thr->th.th_set_proc_bind = proc_bind_default;
4103 #if KMP_AFFINITY_SUPPORTED
4104  this_thr->th.th_new_place = this_thr->th.th_current_place;
4105 #endif
4106 #endif
4107  this_thr->th.th_root = master->th.th_root;
4108 
4109  /* setup the thread's cache of the team structure */
4110  this_thr->th.th_team_nproc = team->t.t_nproc;
4111  this_thr->th.th_team_master = master;
4112  this_thr->th.th_team_serialized = team->t.t_serialized;
4113  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4114 
4115  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4116 
4117  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4118  tid, gtid, this_thr, this_thr->th.th_current_task));
4119 
4120  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4121  team, tid, TRUE);
4122 
4123  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4124  tid, gtid, this_thr, this_thr->th.th_current_task));
4125  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4126  // __kmp_initialize_team()?
4127 
4128  /* TODO no worksharing in speculative threads */
4129  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4130 
4131  this_thr->th.th_local.this_construct = 0;
4132 
4133  if (!this_thr->th.th_pri_common) {
4134  this_thr->th.th_pri_common =
4135  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4136  if (__kmp_storage_map) {
4137  __kmp_print_storage_map_gtid(
4138  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4139  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4140  }
4141  this_thr->th.th_pri_head = NULL;
4142  }
4143 
4144  /* Initialize dynamic dispatch */
4145  {
4146  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4147  // Use team max_nproc since this will never change for the team.
4148  size_t disp_size =
4149  sizeof(dispatch_private_info_t) *
4150  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4151  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4152  team->t.t_max_nproc));
4153  KMP_ASSERT(dispatch);
4154  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4155  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4156 
4157  dispatch->th_disp_index = 0;
4158 #if OMP_45_ENABLED
4159  dispatch->th_doacross_buf_idx = 0;
4160 #endif
4161  if (!dispatch->th_disp_buffer) {
4162  dispatch->th_disp_buffer =
4163  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4164 
4165  if (__kmp_storage_map) {
4166  __kmp_print_storage_map_gtid(
4167  gtid, &dispatch->th_disp_buffer[0],
4168  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4169  ? 1
4170  : __kmp_dispatch_num_buffers],
4171  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4172  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4173  gtid, team->t.t_id, gtid);
4174  }
4175  } else {
4176  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4177  }
4178 
4179  dispatch->th_dispatch_pr_current = 0;
4180  dispatch->th_dispatch_sh_current = 0;
4181 
4182  dispatch->th_deo_fcn = 0; /* ORDERED */
4183  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4184  }
4185 
4186  this_thr->th.th_next_pool = NULL;
4187 
4188  if (!this_thr->th.th_task_state_memo_stack) {
4189  size_t i;
4190  this_thr->th.th_task_state_memo_stack =
4191  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4192  this_thr->th.th_task_state_top = 0;
4193  this_thr->th.th_task_state_stack_sz = 4;
4194  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4195  ++i) // zero init the stack
4196  this_thr->th.th_task_state_memo_stack[i] = 0;
4197  }
4198 
4199  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4200  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4201 
4202  KMP_MB();
4203 }
4204 
4205 /* Allocate a new thread for the requesting team. This is only called from
4206  within a forkjoin critical section. We will first try to get an available
4207  thread from the thread pool. If none is available, we will fork a new one,
4208  assuming we are able to create one. This should be assured, as the caller
4209  should have checked for that first. */
4210 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4211  int new_tid) {
4212  kmp_team_t *serial_team;
4213  kmp_info_t *new_thr;
4214  int new_gtid;
4215 
4216  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4217  KMP_DEBUG_ASSERT(root && team);
4218 #if !KMP_NESTED_HOT_TEAMS
4219  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4220 #endif
4221  KMP_MB();
4222 
4223  /* first, try to get one from the thread pool */
4224  if (__kmp_thread_pool) {
4225 
4226  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4227  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4228  if (new_thr == __kmp_thread_pool_insert_pt) {
4229  __kmp_thread_pool_insert_pt = NULL;
4230  }
4231  TCW_4(new_thr->th.th_in_pool, FALSE);
4232  // Don't touch th_active_in_pool or th_active.
4233  // The worker thread adjusts those flags as it sleeps/awakens.
4234  __kmp_thread_pool_nth--;
4235 
4236  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4237  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4238  KMP_ASSERT(!new_thr->th.th_team);
4239  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4240  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4241 
4242  /* setup the thread structure */
4243  __kmp_initialize_info(new_thr, team, new_tid,
4244  new_thr->th.th_info.ds.ds_gtid);
4245  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4246 
4247  TCW_4(__kmp_nth, __kmp_nth + 1);
4248  root->r.r_cg_nthreads++;
4249 
4250  new_thr->th.th_task_state = 0;
4251  new_thr->th.th_task_state_top = 0;
4252  new_thr->th.th_task_state_stack_sz = 4;
4253 
4254 #ifdef KMP_ADJUST_BLOCKTIME
4255  /* Adjust blocktime back to zero if necessary */
4256  /* Middle initialization might not have occurred yet */
4257  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4258  if (__kmp_nth > __kmp_avail_proc) {
4259  __kmp_zero_bt = TRUE;
4260  }
4261  }
4262 #endif /* KMP_ADJUST_BLOCKTIME */
4263 
4264 #if KMP_DEBUG
4265  // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4266  // equal KMP_BARRIER_PARENT_FLAG.
4267  int b;
4268  kmp_balign_t *balign = new_thr->th.th_bar;
4269  for (b = 0; b < bs_last_barrier; ++b)
4270  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4271 #endif
4272 
4273  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4274  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4275 
4276  KMP_MB();
4277  return new_thr;
4278  }
4279 
4280  /* no thread available in the pool -- fork a new one */
4281  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4282  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4283 
4284 #if KMP_USE_MONITOR
4285  // If this is the first worker thread the RTL is creating, then also
4286  // launch the monitor thread. We try to do this as early as possible.
4287  if (!TCR_4(__kmp_init_monitor)) {
4288  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4289  if (!TCR_4(__kmp_init_monitor)) {
4290  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4291  TCW_4(__kmp_init_monitor, 1);
4292  __kmp_create_monitor(&__kmp_monitor);
4293  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4294 #if KMP_OS_WINDOWS
4295  // AC: wait until monitor has started. This is a fix for CQ232808.
4296  // The reason is that if the library is loaded/unloaded in a loop with
4297  // small (parallel) work in between, then there is a high probability that
4298  // the monitor thread starts after the library shutdown. At shutdown it is
4299  // too late to cope with the problem, because when the master is in
4300  // DllMain (process detach) the monitor has no chance to start (it is
4301  // blocked), and the master has no means to inform the monitor that the
4302  // library has gone, because all the memory the monitor can access is
4303  // going to be released/reset.
4304  while (TCR_4(__kmp_init_monitor) < 2) {
4305  KMP_YIELD(TRUE);
4306  }
4307  KF_TRACE(10, ("after monitor thread has started\n"));
4308 #endif
4309  }
4310  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4311  }
4312 #endif
4313 
4314  KMP_MB();
4315  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4316  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4317  }
4318 
4319  /* allocate space for it. */
4320  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4321 
4322  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4323 
4324  if (__kmp_storage_map) {
4325  __kmp_print_thread_storage_map(new_thr, new_gtid);
4326  }
4327 
4328  // add the reserve serialized team, initialized from the team's master thread
4329  {
4330  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4331  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4332  new_thr->th.th_serial_team = serial_team =
4333  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4334 #if OMPT_SUPPORT
4335  ompt_data_none, // root parallel id
4336 #endif
4337 #if OMP_40_ENABLED
4338  proc_bind_default,
4339 #endif
4340  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4341  }
4342  KMP_ASSERT(serial_team);
4343  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4344  // for execution (it is unused for now).
4345  serial_team->t.t_threads[0] = new_thr;
4346  KF_TRACE(10,
4347  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4348  new_thr));
4349 
4350  /* setup the thread structures */
4351  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4352 
4353 #if USE_FAST_MEMORY
4354  __kmp_initialize_fast_memory(new_thr);
4355 #endif /* USE_FAST_MEMORY */
4356 
4357 #if KMP_USE_BGET
4358  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4359  __kmp_initialize_bget(new_thr);
4360 #endif
4361 
4362  __kmp_init_random(new_thr); // Initialize random number generator
4363 
4364  /* Initialize these only once when thread is grabbed for a team allocation */
4365  KA_TRACE(20,
4366  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4367  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4368 
4369  int b;
4370  kmp_balign_t *balign = new_thr->th.th_bar;
4371  for (b = 0; b < bs_last_barrier; ++b) {
4372  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4373  balign[b].bb.team = NULL;
4374  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4375  balign[b].bb.use_oncore_barrier = 0;
4376  }
4377 
4378  new_thr->th.th_spin_here = FALSE;
4379  new_thr->th.th_next_waiting = 0;
4380 #if KMP_OS_UNIX
4381  new_thr->th.th_blocking = false;
4382 #endif
4383 
4384 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4385  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4386  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4387  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4388  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4389 #endif
4390 #if OMP_50_ENABLED
4391  new_thr->th.th_def_allocator = __kmp_def_allocator;
4392  new_thr->th.th_prev_level = 0;
4393  new_thr->th.th_prev_num_threads = 1;
4394 #endif
4395 
4396  TCW_4(new_thr->th.th_in_pool, FALSE);
4397  new_thr->th.th_active_in_pool = FALSE;
4398  TCW_4(new_thr->th.th_active, TRUE);
4399 
4400  /* adjust the global counters */
4401  __kmp_all_nth++;
4402  __kmp_nth++;
4403 
4404  root->r.r_cg_nthreads++;
4405 
4406  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4407  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4408  if (__kmp_adjust_gtid_mode) {
4409  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4410  if (TCR_4(__kmp_gtid_mode) != 2) {
4411  TCW_4(__kmp_gtid_mode, 2);
4412  }
4413  } else {
4414  if (TCR_4(__kmp_gtid_mode) != 1) {
4415  TCW_4(__kmp_gtid_mode, 1);
4416  }
4417  }
4418  }
4419 
4420 #ifdef KMP_ADJUST_BLOCKTIME
4421  /* Adjust blocktime back to zero if necessary */
4422  /* Middle initialization might not have occurred yet */
4423  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4424  if (__kmp_nth > __kmp_avail_proc) {
4425  __kmp_zero_bt = TRUE;
4426  }
4427  }
4428 #endif /* KMP_ADJUST_BLOCKTIME */
4429 
4430  /* actually fork it and create the new worker thread */
4431  KF_TRACE(
4432  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4433  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4434  KF_TRACE(10,
4435  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4436 
4437  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4438  new_gtid));
4439  KMP_MB();
4440  return new_thr;
4441 }
4442 
4443 /* Reinitialize team for reuse.
4444  The hot team code calls this routine at every fork barrier, so the EPCC
4445  barrier tests are extremely sensitive to changes in it, esp. writes to the team
4446  struct, which cause a cache invalidation in all threads.
4447  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4448 static void __kmp_reinitialize_team(kmp_team_t *team,
4449  kmp_internal_control_t *new_icvs,
4450  ident_t *loc) {
4451  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4452  team->t.t_threads[0], team));
4453  KMP_DEBUG_ASSERT(team && new_icvs);
4454  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4455  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4456 
4457  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4458  // Copy ICVs to the master thread's implicit taskdata
4459  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4460  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4461 
4462  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4463  team->t.t_threads[0], team));
4464 }
4465 
4466 /* Initialize the team data structure.
4467  This assumes the t_threads and t_max_nproc are already set.
4468  Also, we don't touch the arguments */
4469 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4470  kmp_internal_control_t *new_icvs,
4471  ident_t *loc) {
4472  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4473 
4474  /* verify */
4475  KMP_DEBUG_ASSERT(team);
4476  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4477  KMP_DEBUG_ASSERT(team->t.t_threads);
4478  KMP_MB();
4479 
4480  team->t.t_master_tid = 0; /* not needed */
4481  /* team->t.t_master_bar; not needed */
4482  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4483  team->t.t_nproc = new_nproc;
4484 
4485  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4486  team->t.t_next_pool = NULL;
4487  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4488  * up hot team */
4489 
4490  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4491  team->t.t_invoke = NULL; /* not needed */
4492 
4493  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4494  team->t.t_sched.sched = new_icvs->sched.sched;
4495 
4496 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4497  team->t.t_fp_control_saved = FALSE; /* not needed */
4498  team->t.t_x87_fpu_control_word = 0; /* not needed */
4499  team->t.t_mxcsr = 0; /* not needed */
4500 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4501 
4502  team->t.t_construct = 0;
4503 
4504  team->t.t_ordered.dt.t_value = 0;
4505  team->t.t_master_active = FALSE;
4506 
4507  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4508 
4509 #ifdef KMP_DEBUG
4510  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4511 #endif
4512 #if KMP_OS_WINDOWS
4513  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4514 #endif
4515 
4516  team->t.t_control_stack_top = NULL;
4517 
4518  __kmp_reinitialize_team(team, new_icvs, loc);
4519 
4520  KMP_MB();
4521  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4522 }
4523 
4524 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4525 /* Sets the full mask for the thread and returns the old mask via old_mask; no changes to internal structures. */
4526 static void
4527 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4528  if (KMP_AFFINITY_CAPABLE()) {
4529  int status;
4530  if (old_mask != NULL) {
4531  status = __kmp_get_system_affinity(old_mask, TRUE);
4532  int error = errno;
4533  if (status != 0) {
4534  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4535  __kmp_msg_null);
4536  }
4537  }
4538  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4539  }
4540 }
4541 #endif
4542 
4543 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4544 
4545 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4546 // It calculates the worker + master thread's partition based upon the parent
4547 // thread's partition, and binds each worker to a place in its partition.
4548 // The master thread's partition should already include its current binding.
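//
// Worked example (illustrative, for the proc_bind_close branch below): with
// n_th = 10 threads and n_places = 4 places, the code computes S = 10/4 = 2,
// rem = 10 - 2*4 = 2 and gap = 4/2 = 2, i.e. every second place receives one
// extra thread. Starting at the master's place, the resulting distribution is
// 3, 2, 3, 2 threads per place, and the scan ends back at the master's place,
// which the KMP_DEBUG_ASSERT at the end of that branch checks.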
4549 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4550  // Copy the master thread's place partition to the team struct
4551  kmp_info_t *master_th = team->t.t_threads[0];
4552  KMP_DEBUG_ASSERT(master_th != NULL);
4553  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4554  int first_place = master_th->th.th_first_place;
4555  int last_place = master_th->th.th_last_place;
4556  int masters_place = master_th->th.th_current_place;
4557  team->t.t_first_place = first_place;
4558  team->t.t_last_place = last_place;
4559 
4560  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4561  "bound to place %d partition = [%d,%d]\n",
4562  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4563  team->t.t_id, masters_place, first_place, last_place));
4564 
4565  switch (proc_bind) {
4566 
4567  case proc_bind_default:
4568  // serial teams might have the proc_bind policy set to proc_bind_default. It
4569  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4570  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4571  break;
4572 
4573  case proc_bind_master: {
4574  int f;
4575  int n_th = team->t.t_nproc;
4576  for (f = 1; f < n_th; f++) {
4577  kmp_info_t *th = team->t.t_threads[f];
4578  KMP_DEBUG_ASSERT(th != NULL);
4579  th->th.th_first_place = first_place;
4580  th->th.th_last_place = last_place;
4581  th->th.th_new_place = masters_place;
4582 #if OMP_50_ENABLED
4583  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4584  team->t.t_display_affinity != 1) {
4585  team->t.t_display_affinity = 1;
4586  }
4587 #endif
4588 
4589  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4590  "partition = [%d,%d]\n",
4591  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4592  f, masters_place, first_place, last_place));
4593  }
4594  } break;
4595 
4596  case proc_bind_close: {
4597  int f;
4598  int n_th = team->t.t_nproc;
4599  int n_places;
4600  if (first_place <= last_place) {
4601  n_places = last_place - first_place + 1;
4602  } else {
4603  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4604  }
4605  if (n_th <= n_places) {
4606  int place = masters_place;
4607  for (f = 1; f < n_th; f++) {
4608  kmp_info_t *th = team->t.t_threads[f];
4609  KMP_DEBUG_ASSERT(th != NULL);
4610 
4611  if (place == last_place) {
4612  place = first_place;
4613  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4614  place = 0;
4615  } else {
4616  place++;
4617  }
4618  th->th.th_first_place = first_place;
4619  th->th.th_last_place = last_place;
4620  th->th.th_new_place = place;
4621 #if OMP_50_ENABLED
4622  if (__kmp_display_affinity && place != th->th.th_current_place &&
4623  team->t.t_display_affinity != 1) {
4624  team->t.t_display_affinity = 1;
4625  }
4626 #endif
4627 
4628  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4629  "partition = [%d,%d]\n",
4630  __kmp_gtid_from_thread(team->t.t_threads[f]),
4631  team->t.t_id, f, place, first_place, last_place));
4632  }
4633  } else {
4634  int S, rem, gap, s_count;
4635  S = n_th / n_places;
4636  s_count = 0;
4637  rem = n_th - (S * n_places);
4638  gap = rem > 0 ? n_places / rem : n_places;
4639  int place = masters_place;
4640  int gap_ct = gap;
4641  for (f = 0; f < n_th; f++) {
4642  kmp_info_t *th = team->t.t_threads[f];
4643  KMP_DEBUG_ASSERT(th != NULL);
4644 
4645  th->th.th_first_place = first_place;
4646  th->th.th_last_place = last_place;
4647  th->th.th_new_place = place;
4648 #if OMP_50_ENABLED
4649  if (__kmp_display_affinity && place != th->th.th_current_place &&
4650  team->t.t_display_affinity != 1) {
4651  team->t.t_display_affinity = 1;
4652  }
4653 #endif
4654  s_count++;
4655 
4656  if ((s_count == S) && rem && (gap_ct == gap)) {
4657  // do nothing; an extra thread will be added to this place on the next iteration
4658  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4659  // we added an extra thread to this place; move to next place
4660  if (place == last_place) {
4661  place = first_place;
4662  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4663  place = 0;
4664  } else {
4665  place++;
4666  }
4667  s_count = 0;
4668  gap_ct = 1;
4669  rem--;
4670  } else if (s_count == S) { // place full; don't add extra
4671  if (place == last_place) {
4672  place = first_place;
4673  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4674  place = 0;
4675  } else {
4676  place++;
4677  }
4678  gap_ct++;
4679  s_count = 0;
4680  }
4681 
4682  KA_TRACE(100,
4683  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4684  "partition = [%d,%d]\n",
4685  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4686  th->th.th_new_place, first_place, last_place));
4687  }
4688  KMP_DEBUG_ASSERT(place == masters_place);
4689  }
4690  } break;
4691 
4692  case proc_bind_spread: {
4693  int f;
4694  int n_th = team->t.t_nproc;
4695  int n_places;
4696  int thidx;
4697  if (first_place <= last_place) {
4698  n_places = last_place - first_place + 1;
4699  } else {
4700  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4701  }
4702  if (n_th <= n_places) {
4703  int place = -1;
4704 
4705  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4706  int S = n_places / n_th;
4707  int s_count, rem, gap, gap_ct;
4708 
4709  place = masters_place;
4710  rem = n_places - n_th * S;
4711  gap = rem ? n_th / rem : 1;
4712  gap_ct = gap;
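      // Hedged worked example (hypothetical numbers): with a master partition
      // of n_places = 7 places [0,6] and n_th = 3 threads, S = 2, rem = 1,
      // gap = 3; the loop below carves out the sub-partitions [0,2], [3,4]
      // and [5,6], binding each thread to the first place of its
      // sub-partition.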
4713  thidx = n_th;
4714  if (update_master_only == 1)
4715  thidx = 1;
4716  for (f = 0; f < thidx; f++) {
4717  kmp_info_t *th = team->t.t_threads[f];
4718  KMP_DEBUG_ASSERT(th != NULL);
4719 
4720  th->th.th_first_place = place;
4721  th->th.th_new_place = place;
4722 #if OMP_50_ENABLED
4723  if (__kmp_display_affinity && place != th->th.th_current_place &&
4724  team->t.t_display_affinity != 1) {
4725  team->t.t_display_affinity = 1;
4726  }
4727 #endif
4728  s_count = 1;
4729  while (s_count < S) {
4730  if (place == last_place) {
4731  place = first_place;
4732  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4733  place = 0;
4734  } else {
4735  place++;
4736  }
4737  s_count++;
4738  }
4739  if (rem && (gap_ct == gap)) {
4740  if (place == last_place) {
4741  place = first_place;
4742  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4743  place = 0;
4744  } else {
4745  place++;
4746  }
4747  rem--;
4748  gap_ct = 0;
4749  }
4750  th->th.th_last_place = place;
4751  gap_ct++;
4752 
4753  if (place == last_place) {
4754  place = first_place;
4755  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4756  place = 0;
4757  } else {
4758  place++;
4759  }
4760 
4761  KA_TRACE(100,
4762  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4763  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4764  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4765  f, th->th.th_new_place, th->th.th_first_place,
4766  th->th.th_last_place, __kmp_affinity_num_masks));
4767  }
4768  } else {
4769  /* The available computation places form a uniform space, so we can
4770  create T partitions of roughly P/T places each and put each thread
4771  into the first place of its partition. */
4772  double current = static_cast<double>(masters_place);
4773  double spacing =
4774  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
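      // Hedged worked example (hypothetical numbers): with the master on
      // place 0, n_places = 8 and n_th = 3, spacing = (8 + 1) / 3 = 3.0, so
      // the loop below produces the partitions [0,2], [3,5] and [6,7] (the
      // last one is clipped to n_places - 1) and binds each thread to the
      // first place of its partition.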
4775  int first, last;
4776  kmp_info_t *th;
4777 
4778  thidx = n_th + 1;
4779  if (update_master_only == 1)
4780  thidx = 1;
4781  for (f = 0; f < thidx; f++) {
4782  first = static_cast<int>(current);
4783  last = static_cast<int>(current + spacing) - 1;
4784  KMP_DEBUG_ASSERT(last >= first);
4785  if (first >= n_places) {
4786  if (masters_place) {
4787  first -= n_places;
4788  last -= n_places;
4789  if (first == (masters_place + 1)) {
4790  KMP_DEBUG_ASSERT(f == n_th);
4791  first--;
4792  }
4793  if (last == masters_place) {
4794  KMP_DEBUG_ASSERT(f == (n_th - 1));
4795  last--;
4796  }
4797  } else {
4798  KMP_DEBUG_ASSERT(f == n_th);
4799  first = 0;
4800  last = 0;
4801  }
4802  }
4803  if (last >= n_places) {
4804  last = (n_places - 1);
4805  }
4806  place = first;
4807  current += spacing;
4808  if (f < n_th) {
4809  KMP_DEBUG_ASSERT(0 <= first);
4810  KMP_DEBUG_ASSERT(n_places > first);
4811  KMP_DEBUG_ASSERT(0 <= last);
4812  KMP_DEBUG_ASSERT(n_places > last);
4813  KMP_DEBUG_ASSERT(last_place >= first_place);
4814  th = team->t.t_threads[f];
4815  KMP_DEBUG_ASSERT(th);
4816  th->th.th_first_place = first;
4817  th->th.th_new_place = place;
4818  th->th.th_last_place = last;
4819 #if OMP_50_ENABLED
4820  if (__kmp_display_affinity && place != th->th.th_current_place &&
4821  team->t.t_display_affinity != 1) {
4822  team->t.t_display_affinity = 1;
4823  }
4824 #endif
4825  KA_TRACE(100,
4826  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4827  "partition = [%d,%d], spacing = %.4f\n",
4828  __kmp_gtid_from_thread(team->t.t_threads[f]),
4829  team->t.t_id, f, th->th.th_new_place,
4830  th->th.th_first_place, th->th.th_last_place, spacing));
4831  }
4832  }
4833  }
4834  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4835  } else {
4836  int S, rem, gap, s_count;
4837  S = n_th / n_places;
4838  s_count = 0;
4839  rem = n_th - (S * n_places);
4840  gap = rem > 0 ? n_places / rem : n_places;
4841  int place = masters_place;
4842  int gap_ct = gap;
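      // Descriptive note: this is the same oversubscription scheme as the
      // proc_bind_close case above (see the hypothetical 10-threads-over-4-
      // places example there); the only difference is that here each thread's
      // partition is narrowed to the single place it is bound to.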
4843  thidx = n_th;
4844  if (update_master_only == 1)
4845  thidx = 1;
4846  for (f = 0; f < thidx; f++) {
4847  kmp_info_t *th = team->t.t_threads[f];
4848  KMP_DEBUG_ASSERT(th != NULL);
4849 
4850  th->th.th_first_place = place;
4851  th->th.th_last_place = place;
4852  th->th.th_new_place = place;
4853 #if OMP_50_ENABLED
4854  if (__kmp_display_affinity && place != th->th.th_current_place &&
4855  team->t.t_display_affinity != 1) {
4856  team->t.t_display_affinity = 1;
4857  }
4858 #endif
4859  s_count++;
4860 
4861  if ((s_count == S) && rem && (gap_ct == gap)) {
4862  // do nothing; an extra thread will be added to this place on the next iteration
4863  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4864  // we added an extra thread to this place; move on to next place
4865  if (place == last_place) {
4866  place = first_place;
4867  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4868  place = 0;
4869  } else {
4870  place++;
4871  }
4872  s_count = 0;
4873  gap_ct = 1;
4874  rem--;
4875  } else if (s_count == S) { // place is full; don't add extra thread
4876  if (place == last_place) {
4877  place = first_place;
4878  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4879  place = 0;
4880  } else {
4881  place++;
4882  }
4883  gap_ct++;
4884  s_count = 0;
4885  }
4886 
4887  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4888  "partition = [%d,%d]\n",
4889  __kmp_gtid_from_thread(team->t.t_threads[f]),
4890  team->t.t_id, f, th->th.th_new_place,
4891  th->th.th_first_place, th->th.th_last_place));
4892  }
4893  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4894  }
4895  } break;
4896 
4897  default:
4898  break;
4899  }
4900 
4901  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4902 }
4903 
4904 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4905 
4906 /* allocate a new team data structure to use. take one off of the free pool if
4907  available */
4908 kmp_team_t *
4909 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4910 #if OMPT_SUPPORT
4911  ompt_data_t ompt_parallel_data,
4912 #endif
4913 #if OMP_40_ENABLED
4914  kmp_proc_bind_t new_proc_bind,
4915 #endif
4916  kmp_internal_control_t *new_icvs,
4917  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4918  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4919  int f;
4920  kmp_team_t *team;
4921  int use_hot_team = !root->r.r_active;
4922  int level = 0;
4923 
4924  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4925  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4926  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4927  KMP_MB();
4928 
4929 #if KMP_NESTED_HOT_TEAMS
4930  kmp_hot_team_ptr_t *hot_teams;
4931  if (master) {
4932  team = master->th.th_team;
4933  level = team->t.t_active_level;
4934  if (master->th.th_teams_microtask) { // in teams construct?
4935  if (master->th.th_teams_size.nteams > 1 &&
4936  ( // #teams > 1
4937  team->t.t_pkfn ==
4938  (microtask_t)__kmp_teams_master || // inner fork of the teams
4939  master->th.th_teams_level <
4940  team->t.t_level)) { // or nested parallel inside the teams
4941  ++level; // do not increment if #teams==1 or for the outer fork of the
4942  // teams; increment otherwise
4943  }
4944  }
4945  hot_teams = master->th.th_hot_teams;
4946  if (level < __kmp_hot_teams_max_level && hot_teams &&
4947  hot_teams[level]
4948  .hot_team) { // hot team has already been allocated for given level
4949  use_hot_team = 1;
4950  } else {
4951  use_hot_team = 0;
4952  }
4953  }
4954 #endif
4955  // Optimization to use a "hot" team
4956  if (use_hot_team && new_nproc > 1) {
4957  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4958 #if KMP_NESTED_HOT_TEAMS
4959  team = hot_teams[level].hot_team;
4960 #else
4961  team = root->r.r_hot_team;
4962 #endif
4963 #if KMP_DEBUG
4964  if (__kmp_tasking_mode != tskm_immediate_exec) {
4965  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4966  "task_team[1] = %p before reinit\n",
4967  team->t.t_task_team[0], team->t.t_task_team[1]));
4968  }
4969 #endif
4970 
4971  // Has the number of threads changed?
4972  /* Let's assume the most common case is that the number of threads is
4973  unchanged, and put that case first. */
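    // Illustration (assumed scenario, not from the source): if the previous
    // parallel region ran with 4 threads and the application now requests 4
    // again, this first branch reuses the hot team as is; a request for 2
    // would take the "decreasing" branch below and release two workers, while
    // a request for 8 would take the "increasing" branch and allocate (or
    // revive reserved) workers.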
4974  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4975  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4976  // This case can mean that omp_set_num_threads() was called and the hot
4977  // team size was already reduced, so we check the special flag
4978  if (team->t.t_size_changed == -1) {
4979  team->t.t_size_changed = 1;
4980  } else {
4981  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4982  }
4983 
4984  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4985  kmp_r_sched_t new_sched = new_icvs->sched;
4986  // set master's schedule as new run-time schedule
4987  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4988 
4989  __kmp_reinitialize_team(team, new_icvs,
4990  root->r.r_uber_thread->th.th_ident);
4991 
4992  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4993  team->t.t_threads[0], team));
4994  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4995 
4996 #if OMP_40_ENABLED
4997 #if KMP_AFFINITY_SUPPORTED
4998  if ((team->t.t_size_changed == 0) &&
4999  (team->t.t_proc_bind == new_proc_bind)) {
5000  if (new_proc_bind == proc_bind_spread) {
5001  __kmp_partition_places(
5002  team, 1); // add flag to update only master for spread
5003  }
5004  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5005  "proc_bind = %d, partition = [%d,%d]\n",
5006  team->t.t_id, new_proc_bind, team->t.t_first_place,
5007  team->t.t_last_place));
5008  } else {
5009  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5010  __kmp_partition_places(team);
5011  }
5012 #else
5013  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5014 #endif /* KMP_AFFINITY_SUPPORTED */
5015 #endif /* OMP_40_ENABLED */
5016  } else if (team->t.t_nproc > new_nproc) {
5017  KA_TRACE(20,
5018  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5019  new_nproc));
5020 
5021  team->t.t_size_changed = 1;
5022 #if KMP_NESTED_HOT_TEAMS
5023  if (__kmp_hot_teams_mode == 0) {
5024  // AC: the saved number of threads should correspond to the team's value in
5025  // this mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5026  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5027  hot_teams[level].hot_team_nth = new_nproc;
5028 #endif // KMP_NESTED_HOT_TEAMS
5029  /* release the extra threads we don't need any more */
5030  for (f = new_nproc; f < team->t.t_nproc; f++) {
5031  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5032  if (__kmp_tasking_mode != tskm_immediate_exec) {
5033  // When decreasing team size, threads no longer in the team should
5034  // unref task team.
5035  team->t.t_threads[f]->th.th_task_team = NULL;
5036  }
5037  __kmp_free_thread(team->t.t_threads[f]);
5038  team->t.t_threads[f] = NULL;
5039  }
5040 #if KMP_NESTED_HOT_TEAMS
5041  } // (__kmp_hot_teams_mode == 0)
5042  else {
5043  // When keeping extra threads in the team, switch them to wait on their
5044  // own b_go flag
5045  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5046  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5047  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5048  for (int b = 0; b < bs_last_barrier; ++b) {
5049  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5050  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5051  }
5052  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5053  }
5054  }
5055  }
5056 #endif // KMP_NESTED_HOT_TEAMS
5057  team->t.t_nproc = new_nproc;
5058  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5059  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5060  __kmp_reinitialize_team(team, new_icvs,
5061  root->r.r_uber_thread->th.th_ident);
5062 
5063  /* update the remaining threads */
5064  for (f = 0; f < new_nproc; ++f) {
5065  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5066  }
5067  // restore the current task state of the master thread: should be the
5068  // implicit task
5069  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5070  team->t.t_threads[0], team));
5071 
5072  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5073 
5074 #ifdef KMP_DEBUG
5075  for (f = 0; f < team->t.t_nproc; f++) {
5076  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5077  team->t.t_threads[f]->th.th_team_nproc ==
5078  team->t.t_nproc);
5079  }
5080 #endif
5081 
5082 #if OMP_40_ENABLED
5083  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5084 #if KMP_AFFINITY_SUPPORTED
5085  __kmp_partition_places(team);
5086 #endif
5087 #endif
5088  } else { // team->t.t_nproc < new_nproc
5089 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5090  kmp_affin_mask_t *old_mask;
5091  if (KMP_AFFINITY_CAPABLE()) {
5092  KMP_CPU_ALLOC(old_mask);
5093  }
5094 #endif
5095 
5096  KA_TRACE(20,
5097  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5098  new_nproc));
5099 
5100  team->t.t_size_changed = 1;
5101 
5102 #if KMP_NESTED_HOT_TEAMS
5103  int avail_threads = hot_teams[level].hot_team_nth;
5104  if (new_nproc < avail_threads)
5105  avail_threads = new_nproc;
5106  kmp_info_t **other_threads = team->t.t_threads;
5107  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5108  // Adjust barrier data of reserved threads (if any) of the team
5109  // Other data will be set in __kmp_initialize_info() below.
5110  int b;
5111  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5112  for (b = 0; b < bs_last_barrier; ++b) {
5113  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5114  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5115 #if USE_DEBUGGER
5116  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5117 #endif
5118  }
5119  }
5120  if (hot_teams[level].hot_team_nth >= new_nproc) {
5121  // we have all the needed threads in reserve, no need to allocate any;
5122  // this is only possible in mode 1, there cannot be reserved threads in mode 0
5123  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5124  team->t.t_nproc = new_nproc; // just get reserved threads involved
5125  } else {
5126  // we may have some threads in reserve, but not enough
5127  team->t.t_nproc =
5128  hot_teams[level]
5129  .hot_team_nth; // get reserved threads involved if any
5130  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5131 #endif // KMP_NESTED_HOT_TEAMS
5132  if (team->t.t_max_nproc < new_nproc) {
5133  /* reallocate larger arrays */
5134  __kmp_reallocate_team_arrays(team, new_nproc);
5135  __kmp_reinitialize_team(team, new_icvs, NULL);
5136  }
5137 
5138 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5139  /* Temporarily set the full mask for the master thread before creating the
5140  workers. The reason is that workers inherit the affinity from the master,
5141  so if a lot of workers are created quickly on a single core, they
5142  don't get a chance to set their own affinity for a long time. */
5143  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5144 #endif
5145 
5146  /* allocate new threads for the hot team */
5147  for (f = team->t.t_nproc; f < new_nproc; f++) {
5148  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5149  KMP_DEBUG_ASSERT(new_worker);
5150  team->t.t_threads[f] = new_worker;
5151 
5152  KA_TRACE(20,
5153  ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5154  "join=%llu, plain=%llu\n",
5155  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5156  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5157  team->t.t_bar[bs_plain_barrier].b_arrived));
5158 
5159  { // Initialize barrier data for new threads.
5160  int b;
5161  kmp_balign_t *balign = new_worker->th.th_bar;
5162  for (b = 0; b < bs_last_barrier; ++b) {
5163  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5164  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5165  KMP_BARRIER_PARENT_FLAG);
5166 #if USE_DEBUGGER
5167  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5168 #endif
5169  }
5170  }
5171  }
5172 
5173 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5174  if (KMP_AFFINITY_CAPABLE()) {
5175  /* Restore initial master thread's affinity mask */
5176  __kmp_set_system_affinity(old_mask, TRUE);
5177  KMP_CPU_FREE(old_mask);
5178  }
5179 #endif
5180 #if KMP_NESTED_HOT_TEAMS
5181  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5182 #endif // KMP_NESTED_HOT_TEAMS
5183  /* make sure everyone is synchronized */
5184  int old_nproc = team->t.t_nproc; // save old value and use to update only
5185  // new threads below
5186  __kmp_initialize_team(team, new_nproc, new_icvs,
5187  root->r.r_uber_thread->th.th_ident);
5188 
5189  /* reinitialize the threads */
5190  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5191  for (f = 0; f < team->t.t_nproc; ++f)
5192  __kmp_initialize_info(team->t.t_threads[f], team, f,
5193  __kmp_gtid_from_tid(f, team));
5194  if (level) { // set th_task_state for new threads in nested hot team
5195  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5196  // only need to set the th_task_state for the new threads. th_task_state
5197  // for master thread will not be accurate until after this in
5198  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5199  // correct value.
5200  for (f = old_nproc; f < team->t.t_nproc; ++f)
5201  team->t.t_threads[f]->th.th_task_state =
5202  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5203  } else { // set th_task_state for new threads in non-nested hot team
5204  int old_state =
5205  team->t.t_threads[0]->th.th_task_state; // copy master's state
5206  for (f = old_nproc; f < team->t.t_nproc; ++f)
5207  team->t.t_threads[f]->th.th_task_state = old_state;
5208  }
5209 
5210 #ifdef KMP_DEBUG
5211  for (f = 0; f < team->t.t_nproc; ++f) {
5212  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5213  team->t.t_threads[f]->th.th_team_nproc ==
5214  team->t.t_nproc);
5215  }
5216 #endif
5217 
5218 #if OMP_40_ENABLED
5219  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5220 #if KMP_AFFINITY_SUPPORTED
5221  __kmp_partition_places(team);
5222 #endif
5223 #endif
5224  } // Check changes in number of threads
5225 
5226 #if OMP_40_ENABLED
5227  kmp_info_t *master = team->t.t_threads[0];
5228  if (master->th.th_teams_microtask) {
5229  for (f = 1; f < new_nproc; ++f) {
5230  // propagate teams construct specific info to workers
5231  kmp_info_t *thr = team->t.t_threads[f];
5232  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5233  thr->th.th_teams_level = master->th.th_teams_level;
5234  thr->th.th_teams_size = master->th.th_teams_size;
5235  }
5236  }
5237 #endif /* OMP_40_ENABLED */
5238 #if KMP_NESTED_HOT_TEAMS
5239  if (level) {
5240  // Sync barrier state for nested hot teams, not needed for outermost hot
5241  // team.
5242  for (f = 1; f < new_nproc; ++f) {
5243  kmp_info_t *thr = team->t.t_threads[f];
5244  int b;
5245  kmp_balign_t *balign = thr->th.th_bar;
5246  for (b = 0; b < bs_last_barrier; ++b) {
5247  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5248  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5249 #if USE_DEBUGGER
5250  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5251 #endif
5252  }
5253  }
5254  }
5255 #endif // KMP_NESTED_HOT_TEAMS
5256 
5257  /* reallocate space for arguments if necessary */
5258  __kmp_alloc_argv_entries(argc, team, TRUE);
5259  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5260  // The hot team re-uses the previous task team,
5261  // if untouched during the previous release->gather phase.
5262 
5263  KF_TRACE(10, (" hot_team = %p\n", team));
5264 
5265 #if KMP_DEBUG
5266  if (__kmp_tasking_mode != tskm_immediate_exec) {
5267  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5268  "task_team[1] = %p after reinit\n",
5269  team->t.t_task_team[0], team->t.t_task_team[1]));
5270  }
5271 #endif
5272 
5273 #if OMPT_SUPPORT
5274  __ompt_team_assign_id(team, ompt_parallel_data);
5275 #endif
5276 
5277  KMP_MB();
5278 
5279  return team;
5280  }
5281 
5282  /* next, let's try to take one from the team pool */
5283  KMP_MB();
5284  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5285  /* TODO: consider resizing undersized teams instead of reaping them, now
5286  that we have a resizing mechanism */
5287  if (team->t.t_max_nproc >= max_nproc) {
5288  /* take this team from the team pool */
5289  __kmp_team_pool = team->t.t_next_pool;
5290 
5291  /* setup the team for fresh use */
5292  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5293 
5294  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5295  "task_team[1] %p to NULL\n",
5296  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5297  team->t.t_task_team[0] = NULL;
5298  team->t.t_task_team[1] = NULL;
5299 
5300  /* reallocate space for arguments if necessary */
5301  __kmp_alloc_argv_entries(argc, team, TRUE);
5302  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5303 
5304  KA_TRACE(
5305  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5306  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5307  { // Initialize barrier data.
5308  int b;
5309  for (b = 0; b < bs_last_barrier; ++b) {
5310  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5311 #if USE_DEBUGGER
5312  team->t.t_bar[b].b_master_arrived = 0;
5313  team->t.t_bar[b].b_team_arrived = 0;
5314 #endif
5315  }
5316  }
5317 
5318 #if OMP_40_ENABLED
5319  team->t.t_proc_bind = new_proc_bind;
5320 #endif
5321 
5322  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5323  team->t.t_id));
5324 
5325 #if OMPT_SUPPORT
5326  __ompt_team_assign_id(team, ompt_parallel_data);
5327 #endif
5328 
5329  KMP_MB();
5330 
5331  return team;
5332  }
5333 
5334  /* reap team if it is too small, then loop back and check the next one */
5335  // not sure if this is wise, but it will be redone during the hot-teams
5336  // rewrite.
5337  /* TODO: Use technique to find the right size hot-team, don't reap them */
5338  team = __kmp_reap_team(team);
5339  __kmp_team_pool = team;
5340  }
5341 
5342  /* nothing available in the pool, no matter, make a new team! */
5343  KMP_MB();
5344  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5345 
5346  /* and set it up */
5347  team->t.t_max_nproc = max_nproc;
5348  /* NOTE well, for some reason allocating one big buffer and dividing it up
5349  seems to hurt performance significantly on the P4, so let's not use this */
5350  __kmp_allocate_team_arrays(team, max_nproc);
5351 
5352  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5353  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5354 
5355  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5356  "%p to NULL\n",
5357  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5358  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5359  // memory, no need to duplicate
5360  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5361  // memory, no need to duplicate
5362 
5363  if (__kmp_storage_map) {
5364  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5365  }
5366 
5367  /* allocate space for arguments */
5368  __kmp_alloc_argv_entries(argc, team, FALSE);
5369  team->t.t_argc = argc;
5370 
5371  KA_TRACE(20,
5372  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5373  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5374  { // Initialize barrier data.
5375  int b;
5376  for (b = 0; b < bs_last_barrier; ++b) {
5377  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5378 #if USE_DEBUGGER
5379  team->t.t_bar[b].b_master_arrived = 0;
5380  team->t.t_bar[b].b_team_arrived = 0;
5381 #endif
5382  }
5383  }
5384 
5385 #if OMP_40_ENABLED
5386  team->t.t_proc_bind = new_proc_bind;
5387 #endif
5388 
5389 #if OMPT_SUPPORT
5390  __ompt_team_assign_id(team, ompt_parallel_data);
5391  team->t.ompt_serialized_team_info = NULL;
5392 #endif
5393 
5394  KMP_MB();
5395 
5396  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5397  team->t.t_id));
5398 
5399  return team;
5400 }
5401 
5402 /* TODO implement hot-teams at all levels */
5403 /* TODO implement lazy thread release on demand (disband request) */
5404 
5405 /* free the team. return it to the team pool. release all the threads
5406  * associated with it */
5407 void __kmp_free_team(kmp_root_t *root,
5408  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5409  int f;
5410  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5411  team->t.t_id));
5412 
5413  /* verify state */
5414  KMP_DEBUG_ASSERT(root);
5415  KMP_DEBUG_ASSERT(team);
5416  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5417  KMP_DEBUG_ASSERT(team->t.t_threads);
5418 
5419  int use_hot_team = team == root->r.r_hot_team;
5420 #if KMP_NESTED_HOT_TEAMS
5421  int level;
5422  kmp_hot_team_ptr_t *hot_teams;
5423  if (master) {
5424  level = team->t.t_active_level - 1;
5425  if (master->th.th_teams_microtask) { // in teams construct?
5426  if (master->th.th_teams_size.nteams > 1) {
5427  ++level; // level was not increased in teams construct for
5428  // team_of_masters
5429  }
5430  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5431  master->th.th_teams_level == team->t.t_level) {
5432  ++level; // level was not increased in teams construct for
5433  // team_of_workers before the parallel
5434  } // team->t.t_level will be increased inside parallel
5435  }
5436  hot_teams = master->th.th_hot_teams;
5437  if (level < __kmp_hot_teams_max_level) {
5438  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5439  use_hot_team = 1;
5440  }
5441  }
5442 #endif // KMP_NESTED_HOT_TEAMS
5443 
5444  /* team is done working */
5445  TCW_SYNC_PTR(team->t.t_pkfn,
5446  NULL); // Important for Debugging Support Library.
5447 #if KMP_OS_WINDOWS
5448  team->t.t_copyin_counter = 0; // init counter for possible reuse
5449 #endif
5450  // Do not reset pointer to parent team to NULL for hot teams.
5451 
5452  /* if we are non-hot team, release our threads */
5453  if (!use_hot_team) {
5454  if (__kmp_tasking_mode != tskm_immediate_exec) {
5455  // Wait for threads to reach reapable state
5456  for (f = 1; f < team->t.t_nproc; ++f) {
5457  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5458  kmp_info_t *th = team->t.t_threads[f];
5459  volatile kmp_uint32 *state = &th->th.th_reap_state;
5460  while (*state != KMP_SAFE_TO_REAP) {
5461 #if KMP_OS_WINDOWS
5462  // On Windows a thread can be killed at any time, check this
5463  DWORD ecode;
5464  if (!__kmp_is_thread_alive(th, &ecode)) {
5465  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5466  break;
5467  }
5468 #endif
5469  // first check if thread is sleeping
5470  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5471  if (fl.is_sleeping())
5472  fl.resume(__kmp_gtid_from_thread(th));
5473  KMP_CPU_PAUSE();
5474  }
5475  }
5476 
5477  // Delete task teams
5478  int tt_idx;
5479  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5480  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5481  if (task_team != NULL) {
5482  for (f = 0; f < team->t.t_nproc;
5483  ++f) { // Have all threads unref task teams
5484  team->t.t_threads[f]->th.th_task_team = NULL;
5485  }
5486  KA_TRACE(
5487  20,
5488  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5489  __kmp_get_gtid(), task_team, team->t.t_id));
5490 #if KMP_NESTED_HOT_TEAMS
5491  __kmp_free_task_team(master, task_team);
5492 #endif
5493  team->t.t_task_team[tt_idx] = NULL;
5494  }
5495  }
5496  }
5497 
5498  // Reset pointer to parent team only for non-hot teams.
5499  team->t.t_parent = NULL;
5500  team->t.t_level = 0;
5501  team->t.t_active_level = 0;
5502 
5503  /* free the worker threads */
5504  for (f = 1; f < team->t.t_nproc; ++f) {
5505  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5506  __kmp_free_thread(team->t.t_threads[f]);
5507  team->t.t_threads[f] = NULL;
5508  }
5509 
5510  /* put the team back in the team pool */
5511  /* TODO limit size of team pool, call reap_team if pool too large */
5512  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5513  __kmp_team_pool = (volatile kmp_team_t *)team;
5514  }
5515 
5516  KMP_MB();
5517 }
5518 
5519 /* reap the team. destroy it, reclaim all its resources and free its memory */
5520 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5521  kmp_team_t *next_pool = team->t.t_next_pool;
5522 
5523  KMP_DEBUG_ASSERT(team);
5524  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5525  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5526  KMP_DEBUG_ASSERT(team->t.t_threads);
5527  KMP_DEBUG_ASSERT(team->t.t_argv);
5528 
5529  /* TODO clean the threads that are a part of this? */
5530 
5531  /* free stuff */
5532  __kmp_free_team_arrays(team);
5533  if (team->t.t_argv != &team->t.t_inline_argv[0])
5534  __kmp_free((void *)team->t.t_argv);
5535  __kmp_free(team);
5536 
5537  KMP_MB();
5538  return next_pool;
5539 }
5540 
5541 // Free the thread. Don't reap it, just place it on the pool of available
5542 // threads.
5543 //
5544 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5545 // binding for the affinity mechanism to be useful.
5546 //
5547 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5548 // However, we want to avoid a potential performance problem by always
5549 // scanning through the list to find the correct point at which to insert
5550 // the thread (potential N**2 behavior). To do this we keep track of the
5551 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5552 // With single-level parallelism, threads will always be added to the tail
5553 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5554 // parallelism, all bets are off and we may need to scan through the entire
5555 // free list.
5556 //
5557 // This change also has a potentially large performance benefit, for some
5558 // applications. Previously, as threads were freed from the hot team, they
5559 // would be placed back on the free list in inverse order. If the hot team
5560 // grew back to its original size, then the freed threads would be placed
5561 // back on the hot team in reverse order. This could cause bad cache
5562 // locality problems in programs where the size of the hot team regularly
5563 // grew and shrank.
5564 //
5565 // Now, for single-level parallelism, the OMP tid is always == gtid.
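//
// Hedged illustration (hypothetical gtids): if the pool currently holds
// gtids 2 -> 5 -> 9 and T#7 is freed, the scan starts at the remembered
// insert point (say, the node for gtid 5), walks forward to gtid 9, and
// links 7 in before it, leaving 2 -> 5 -> 7 -> 9 with
// __kmp_thread_pool_insert_pt now pointing at the node for 7.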
5566 void __kmp_free_thread(kmp_info_t *this_th) {
5567  int gtid;
5568  kmp_info_t **scan;
5569  kmp_root_t *root = this_th->th.th_root;
5570 
5571  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5572  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5573 
5574  KMP_DEBUG_ASSERT(this_th);
5575 
5576  // When moving a thread to the pool, switch it to wait on its own b_go flag
5577  // and leave its team pointer uninitialized (NULL team).
5578  int b;
5579  kmp_balign_t *balign = this_th->th.th_bar;
5580  for (b = 0; b < bs_last_barrier; ++b) {
5581  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5582  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5583  balign[b].bb.team = NULL;
5584  balign[b].bb.leaf_kids = 0;
5585  }
5586  this_th->th.th_task_state = 0;
5587  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5588 
5589  /* put thread back on the free pool */
5590  TCW_PTR(this_th->th.th_team, NULL);
5591  TCW_PTR(this_th->th.th_root, NULL);
5592  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5593 
5594  /* If the implicit task assigned to this thread can be used by other threads,
5595  * then multiple threads can share the data and try to free the task in
5596  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5597  * with higher probability when the hot team is disabled, but it can occur even
5598  * when the hot team is enabled */
5599  __kmp_free_implicit_task(this_th);
5600  this_th->th.th_current_task = NULL;
5601 
5602  // If the __kmp_thread_pool_insert_pt is already past the new insert
5603  // point, then we need to re-scan the entire list.
5604  gtid = this_th->th.th_info.ds.ds_gtid;
5605  if (__kmp_thread_pool_insert_pt != NULL) {
5606  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5607  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5608  __kmp_thread_pool_insert_pt = NULL;
5609  }
5610  }
5611 
5612  // Scan down the list to find the place to insert the thread.
5613  // scan is the address of a link in the list, possibly the address of
5614  // __kmp_thread_pool itself.
5615  //
5616  // In the absence of nested parallelism, the for loop will have 0 iterations.
5617  if (__kmp_thread_pool_insert_pt != NULL) {
5618  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5619  } else {
5620  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5621  }
5622  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5623  scan = &((*scan)->th.th_next_pool))
5624  ;
5625 
5626  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5627  // to its address.
5628  TCW_PTR(this_th->th.th_next_pool, *scan);
5629  __kmp_thread_pool_insert_pt = *scan = this_th;
5630  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5631  (this_th->th.th_info.ds.ds_gtid <
5632  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5633  TCW_4(this_th->th.th_in_pool, TRUE);
5634  __kmp_thread_pool_nth++;
5635 
5636  TCW_4(__kmp_nth, __kmp_nth - 1);
5637  root->r.r_cg_nthreads--;
5638 
5639 #ifdef KMP_ADJUST_BLOCKTIME
5640  /* Adjust blocktime back to user setting or default if necessary */
5641  /* Middle initialization might never have occurred */
5642  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5643  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5644  if (__kmp_nth <= __kmp_avail_proc) {
5645  __kmp_zero_bt = FALSE;
5646  }
5647  }
5648 #endif /* KMP_ADJUST_BLOCKTIME */
5649 
5650  KMP_MB();
5651 }
5652 
5653 /* ------------------------------------------------------------------------ */
5654 
5655 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5656  int gtid = this_thr->th.th_info.ds.ds_gtid;
5657  /* void *stack_data;*/
5658  kmp_team_t *(*volatile pteam);
5659 
5660  KMP_MB();
5661  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5662 
5663  if (__kmp_env_consistency_check) {
5664  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5665  }
5666 
5667 #if OMPT_SUPPORT
5668  ompt_data_t *thread_data;
5669  if (ompt_enabled.enabled) {
5670  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5671  *thread_data = ompt_data_none;
5672 
5673  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5674  this_thr->th.ompt_thread_info.wait_id = 0;
5675  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5676  if (ompt_enabled.ompt_callback_thread_begin) {
5677  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5678  ompt_thread_worker, thread_data);
5679  }
5680  }
5681 #endif
5682 
5683 #if OMPT_SUPPORT
5684  if (ompt_enabled.enabled) {
5685  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5686  }
5687 #endif
5688  /* This is the place where threads wait for work */
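  // In outline (descriptive only): each worker repeatedly sleeps in the fork
  // barrier until it is given a team, runs the team's microtask via t_invoke,
  // and then waits in the join barrier, looping until __kmp_global.g.g_done
  // is set at shutdown.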
5689  while (!TCR_4(__kmp_global.g.g_done)) {
5690  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5691  KMP_MB();
5692 
5693  /* wait for work to do */
5694  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5695 
5696  /* No tid yet since not part of a team */
5697  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5698 
5699 #if OMPT_SUPPORT
5700  if (ompt_enabled.enabled) {
5701  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5702  }
5703 #endif
5704 
5705  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5706 
5707  /* have we been allocated? */
5708  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5709  /* we were just woken up, so run our new task */
5710  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5711  int rc;
5712  KA_TRACE(20,
5713  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5714  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5715  (*pteam)->t.t_pkfn));
5716 
5717  updateHWFPControl(*pteam);
5718 
5719 #if OMPT_SUPPORT
5720  if (ompt_enabled.enabled) {
5721  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5722  }
5723 #endif
5724 
5725  rc = (*pteam)->t.t_invoke(gtid);
5726  KMP_ASSERT(rc);
5727 
5728  KMP_MB();
5729  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5730  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5731  (*pteam)->t.t_pkfn));
5732  }
5733 #if OMPT_SUPPORT
5734  if (ompt_enabled.enabled) {
5735  /* no frame set while outside task */
5736  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5737 
5738  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5739  }
5740 #endif
5741  /* join barrier after parallel region */
5742  __kmp_join_barrier(gtid);
5743  }
5744  }
5745  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5746 
5747 #if OMPT_SUPPORT
5748  if (ompt_enabled.ompt_callback_thread_end) {
5749  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5750  }
5751 #endif
5752 
5753  this_thr->th.th_task_team = NULL;
5754  /* run the destructors for the threadprivate data for this thread */
5755  __kmp_common_destroy_gtid(gtid);
5756 
5757  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5758  KMP_MB();
5759  return this_thr;
5760 }
5761 
5762 /* ------------------------------------------------------------------------ */
5763 
5764 void __kmp_internal_end_dest(void *specific_gtid) {
5765 #if KMP_COMPILER_ICC
5766 #pragma warning(push)
5767 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5768 // significant bits
5769 #endif
5770  // Make sure no significant bits are lost
5771  int gtid = (kmp_intptr_t)specific_gtid - 1;
5772 #if KMP_COMPILER_ICC
5773 #pragma warning(pop)
5774 #endif
5775 
5776  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5777  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5778  * this is because 0 is reserved for the nothing-stored case */
5779 
5780  /* josh: One reason for setting the gtid specific data even when it is being
5781  destroyed by pthread is to allow gtid lookup through thread specific data
5782  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5783  that gets executed in the call to __kmp_internal_end_thread, actually
5784  gets the gtid through the thread specific data. Setting it here seems
5785  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5786  to run smoothly.
5787  todo: get rid of this after we remove the dependence on
5788  __kmp_gtid_get_specific */
5789  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5790  __kmp_gtid_set_specific(gtid);
5791 #ifdef KMP_TDATA_GTID
5792  __kmp_gtid = gtid;
5793 #endif
5794  __kmp_internal_end_thread(gtid);
5795 }
5796 
5797 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5798 
5799 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5800 // cases destructors work perfectly, but in the real libomp.so I have no evidence
5801 // it is ever called. However, the -fini linker option in makefile.mk works fine.
5802 
5803 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5804  __kmp_internal_end_atexit();
5805 }
5806 
5807 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5808 
5809 #endif
5810 
5811 /* [Windows] josh: when the atexit handler is called, there may still be more
5812  than one thread alive */
5813 void __kmp_internal_end_atexit(void) {
5814  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5815  /* [Windows]
5816  josh: ideally, we want to completely shutdown the library in this atexit
5817  handler, but stat code that depends on thread specific data for gtid fails
5818  because that data becomes unavailable at some point during the shutdown, so
5819  we call __kmp_internal_end_thread instead. We should eventually remove the
5820  dependency on __kmp_get_specific_gtid in the stat code and use
5821  __kmp_internal_end_library to cleanly shutdown the library.
5822 
5823  // TODO: Can some of this comment about GVS be removed?
5824  I suspect that the offending stat code is executed when the calling thread
5825  tries to clean up a dead root thread's data structures, resulting in GVS
5826  code trying to close the GVS structures for that thread, but since the stat
5827  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5828  the calling thread is cleaning up itself instead of another thread, it gets
5829  confused. This happens because allowing a thread to unregister and clean up
5830  another thread is a recent modification for addressing an issue.
5831  Based on the current design (20050722), a thread may end up
5832  trying to unregister another thread only if thread death does not trigger
5833  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5834  thread specific data destructor function to detect thread death. For
5835  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5836  is nothing. Thus, the workaround is applicable only for Windows static
5837  stat library. */
5838  __kmp_internal_end_library(-1);
5839 #if KMP_OS_WINDOWS
5840  __kmp_close_console();
5841 #endif
5842 }
5843 
5844 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5845  // It is assumed __kmp_forkjoin_lock is acquired.
5846 
5847  int gtid;
5848 
5849  KMP_DEBUG_ASSERT(thread != NULL);
5850 
5851  gtid = thread->th.th_info.ds.ds_gtid;
5852 
5853  if (!is_root) {
5854 
5855  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5856  /* Assume the threads are at the fork barrier here */
5857  KA_TRACE(
5858  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5859  gtid));
5860  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5861  * (GEH) */
5862  ANNOTATE_HAPPENS_BEFORE(thread);
5863  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5864  __kmp_release_64(&flag);
5865  }
5866 
5867  // Terminate OS thread.
5868  __kmp_reap_worker(thread);
5869 
5870  // The thread was killed asynchronously. If it was actively
5871  // spinning in the thread pool, decrement the global count.
5872  //
5873  // There is a small timing hole here - if the worker thread was just waking
5874  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5875  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5876  // the global counter might not get updated.
5877  //
5878  // Currently, this can only happen as the library is unloaded,
5879  // so there are no harmful side effects.
5880  if (thread->th.th_active_in_pool) {
5881  thread->th.th_active_in_pool = FALSE;
5882  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5883  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5884  }
5885 
5886  // Decrement # of [worker] threads in the pool.
5887  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5888  --__kmp_thread_pool_nth;
5889  }
5890 
5891  __kmp_free_implicit_task(thread);
5892 
5893 // Free the fast memory for tasking
5894 #if USE_FAST_MEMORY
5895  __kmp_free_fast_memory(thread);
5896 #endif /* USE_FAST_MEMORY */
5897 
5898  __kmp_suspend_uninitialize_thread(thread);
5899 
5900  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5901  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5902 
5903  --__kmp_all_nth;
5904 // __kmp_nth was decremented when thread is added to the pool.
5905 
5906 #ifdef KMP_ADJUST_BLOCKTIME
5907  /* Adjust blocktime back to user setting or default if necessary */
5908  /* Middle initialization might never have occurred */
5909  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5910  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5911  if (__kmp_nth <= __kmp_avail_proc) {
5912  __kmp_zero_bt = FALSE;
5913  }
5914  }
5915 #endif /* KMP_ADJUST_BLOCKTIME */
5916 
5917  /* free the memory being used */
5918  if (__kmp_env_consistency_check) {
5919  if (thread->th.th_cons) {
5920  __kmp_free_cons_stack(thread->th.th_cons);
5921  thread->th.th_cons = NULL;
5922  }
5923  }
5924 
5925  if (thread->th.th_pri_common != NULL) {
5926  __kmp_free(thread->th.th_pri_common);
5927  thread->th.th_pri_common = NULL;
5928  }
5929 
5930  if (thread->th.th_task_state_memo_stack != NULL) {
5931  __kmp_free(thread->th.th_task_state_memo_stack);
5932  thread->th.th_task_state_memo_stack = NULL;
5933  }
5934 
5935 #if KMP_USE_BGET
5936  if (thread->th.th_local.bget_data != NULL) {
5937  __kmp_finalize_bget(thread);
5938  }
5939 #endif
5940 
5941 #if KMP_AFFINITY_SUPPORTED
5942  if (thread->th.th_affin_mask != NULL) {
5943  KMP_CPU_FREE(thread->th.th_affin_mask);
5944  thread->th.th_affin_mask = NULL;
5945  }
5946 #endif /* KMP_AFFINITY_SUPPORTED */
5947 
5948 #if KMP_USE_HIER_SCHED
5949  if (thread->th.th_hier_bar_data != NULL) {
5950  __kmp_free(thread->th.th_hier_bar_data);
5951  thread->th.th_hier_bar_data = NULL;
5952  }
5953 #endif
5954 
5955  __kmp_reap_team(thread->th.th_serial_team);
5956  thread->th.th_serial_team = NULL;
5957  __kmp_free(thread);
5958 
5959  KMP_MB();
5960 
5961 } // __kmp_reap_thread
5962 
5963 static void __kmp_internal_end(void) {
5964  int i;
5965 
5966  /* First, unregister the library */
5967  __kmp_unregister_library();
5968 
5969 #if KMP_OS_WINDOWS
5970  /* In Win static library, we can't tell when a root actually dies, so we
5971  reclaim the data structures for any root threads that have died but not
5972  unregistered themselves, in order to shut down cleanly.
5973  In Win dynamic library we also can't tell when a thread dies. */
5974  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5975 // dead roots
5976 #endif
5977 
5978  for (i = 0; i < __kmp_threads_capacity; i++)
5979  if (__kmp_root[i])
5980  if (__kmp_root[i]->r.r_active)
5981  break;
5982  KMP_MB(); /* Flush all pending memory write invalidates. */
5983  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5984 
5985  if (i < __kmp_threads_capacity) {
5986 #if KMP_USE_MONITOR
5987  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5988  KMP_MB(); /* Flush all pending memory write invalidates. */
5989 
5990  // Need to check that monitor was initialized before reaping it. If we are
5991  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5992  // __kmp_monitor will appear to contain valid data, but it is only valid in
5993  // the parent process, not the child.
5994  // New behavior (201008): instead of keying off of the flag
5995  // __kmp_init_parallel, the monitor thread creation is keyed off
5996  // of the new flag __kmp_init_monitor.
5997  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5998  if (TCR_4(__kmp_init_monitor)) {
5999  __kmp_reap_monitor(&__kmp_monitor);
6000  TCW_4(__kmp_init_monitor, 0);
6001  }
6002  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6003  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6004 #endif // KMP_USE_MONITOR
6005  } else {
6006 /* TODO move this to cleanup code */
6007 #ifdef KMP_DEBUG
6008  /* make sure that everything has properly ended */
6009  for (i = 0; i < __kmp_threads_capacity; i++) {
6010  if (__kmp_root[i]) {
6011  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6012  // there can be uber threads alive here
6013  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6014  }
6015  }
6016 #endif
6017 
6018  KMP_MB();
6019 
6020  // Reap the worker threads.
6021  // This is valid for now, but be careful if threads are reaped sooner.
6022  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6023  // Get the next thread from the pool.
6024  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6025  __kmp_thread_pool = thread->th.th_next_pool;
6026  // Reap it.
6027  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6028  thread->th.th_next_pool = NULL;
6029  thread->th.th_in_pool = FALSE;
6030  __kmp_reap_thread(thread, 0);
6031  }
6032  __kmp_thread_pool_insert_pt = NULL;
6033 
6034  // Reap teams.
6035  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6036  // Get the next team from the pool.
6037  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6038  __kmp_team_pool = team->t.t_next_pool;
6039  // Reap it.
6040  team->t.t_next_pool = NULL;
6041  __kmp_reap_team(team);
6042  }
6043 
6044  __kmp_reap_task_teams();
6045 
6046 #if KMP_OS_UNIX
6047  // Threads that are not reaped should not access any resources since they
6048  // are going to be deallocated soon, so the shutdown sequence should wait
6049  // until all threads either exit the final spin-waiting loop or begin
6050  // sleeping after the given blocktime.
6051  for (i = 0; i < __kmp_threads_capacity; i++) {
6052  kmp_info_t *thr = __kmp_threads[i];
6053  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6054  KMP_CPU_PAUSE();
6055  }
6056 #endif
6057 
6058  for (i = 0; i < __kmp_threads_capacity; ++i) {
6059  // TBD: Add some checking...
6060  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6061  }
6062 
6063  /* Make sure all threadprivate destructors get run by joining with all
6064  worker threads before resetting this flag */
6065  TCW_SYNC_4(__kmp_init_common, FALSE);
6066 
6067  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6068  KMP_MB();
6069 
6070 #if KMP_USE_MONITOR
6071  // See note above: One of the possible fixes for CQ138434 / CQ140126
6072  //
6073  // FIXME: push both code fragments down and CSE them?
6074  // push them into __kmp_cleanup() ?
6075  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6076  if (TCR_4(__kmp_init_monitor)) {
6077  __kmp_reap_monitor(&__kmp_monitor);
6078  TCW_4(__kmp_init_monitor, 0);
6079  }
6080  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6081  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6082 #endif
6083  } /* else !__kmp_global.t_active */
6084  TCW_4(__kmp_init_gtid, FALSE);
6085  KMP_MB(); /* Flush all pending memory write invalidates. */
6086 
6087  __kmp_cleanup();
6088 #if OMPT_SUPPORT
6089  ompt_fini();
6090 #endif
6091 }
6092 
6093 void __kmp_internal_end_library(int gtid_req) {
6094  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6095  /* this shouldn't be a race condition because __kmp_internal_end() is the
6096  only place to clear __kmp_serial_init */
6097  /* we'll check this later too, after we get the lock */
6098  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6099  // redundant, because the next check will work in any case.
6100  if (__kmp_global.g.g_abort) {
6101  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6102  /* TODO abort? */
6103  return;
6104  }
6105  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6106  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6107  return;
6108  }
6109 
6110  KMP_MB(); /* Flush all pending memory write invalidates. */
6111 
6112  /* find out who we are and what we should do */
6113  {
6114  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6115  KA_TRACE(
6116  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6117  if (gtid == KMP_GTID_SHUTDOWN) {
6118  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6119  "already shutdown\n"));
6120  return;
6121  } else if (gtid == KMP_GTID_MONITOR) {
6122  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6123  "registered, or system shutdown\n"));
6124  return;
6125  } else if (gtid == KMP_GTID_DNE) {
6126  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6127  "shutdown\n"));
6128  /* we don't know who we are, but we may still shut down the library */
6129  } else if (KMP_UBER_GTID(gtid)) {
6130  /* unregister ourselves as an uber thread. gtid is no longer valid */
6131  if (__kmp_root[gtid]->r.r_active) {
6132  __kmp_global.g.g_abort = -1;
6133  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6134  KA_TRACE(10,
6135  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6136  gtid));
6137  return;
6138  } else {
6139  KA_TRACE(
6140  10,
6141  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6142  __kmp_unregister_root_current_thread(gtid);
6143  }
6144  } else {
6145 /* worker threads may call this function through the atexit handler, if they
6146  * call exit() */
6147 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6148  TODO: do a thorough shutdown instead */
6149 #ifdef DUMP_DEBUG_ON_EXIT
6150  if (__kmp_debug_buf)
6151  __kmp_dump_debug_buffer();
6152 #endif
6153  return;
6154  }
6155  }
6156  /* synchronize the termination process */
6157  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6158 
6159  /* have we already finished */
6160  if (__kmp_global.g.g_abort) {
6161  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6162  /* TODO abort? */
6163  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6164  return;
6165  }
6166  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6167  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6168  return;
6169  }
6170 
6171  /* We need this lock to enforce mutex between this reading of
6172  __kmp_threads_capacity and the writing by __kmp_register_root.
6173  Alternatively, we can use a counter of roots that is atomically updated by
6174  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6175  __kmp_internal_end_*. */
6176  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6177 
6178  /* now we can safely conduct the actual termination */
6179  __kmp_internal_end();
6180 
6181  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6182  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6183 
6184  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6185 
6186 #ifdef DUMP_DEBUG_ON_EXIT
6187  if (__kmp_debug_buf)
6188  __kmp_dump_debug_buffer();
6189 #endif
6190 
6191 #if KMP_OS_WINDOWS
6192  __kmp_close_console();
6193 #endif
6194 
6195  __kmp_fini_allocator();
6196 
6197 } // __kmp_internal_end_library
6198 
6199 void __kmp_internal_end_thread(int gtid_req) {
6200  int i;
6201 
6202  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6203  /* this shouldn't be a race condition because __kmp_internal_end() is the
6204  * only place to clear __kmp_serial_init */
6205  /* we'll check this later too, after we get the lock */
6206  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6207  // redundant, because the next check will work in any case.
6208  if (__kmp_global.g.g_abort) {
6209  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6210  /* TODO abort? */
6211  return;
6212  }
6213  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6214  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6215  return;
6216  }
6217 
6218  KMP_MB(); /* Flush all pending memory write invalidates. */
6219 
6220  /* find out who we are and what we should do */
6221  {
6222  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6223  KA_TRACE(10,
6224  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6225  if (gtid == KMP_GTID_SHUTDOWN) {
6226  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6227  "already shutdown\n"));
6228  return;
6229  } else if (gtid == KMP_GTID_MONITOR) {
6230  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6231  "registered, or system shutdown\n"));
6232  return;
6233  } else if (gtid == KMP_GTID_DNE) {
6234  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6235  "shutdown\n"));
6236  return;
6237  /* we don't know who we are */
6238  } else if (KMP_UBER_GTID(gtid)) {
6239  /* unregister ourselves as an uber thread. gtid is no longer valid */
6240  if (__kmp_root[gtid]->r.r_active) {
6241  __kmp_global.g.g_abort = -1;
6242  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6243  KA_TRACE(10,
6244  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6245  gtid));
6246  return;
6247  } else {
6248  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6249  gtid));
6250  __kmp_unregister_root_current_thread(gtid);
6251  }
6252  } else {
6253  /* just a worker thread, let's leave */
6254  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6255 
6256  if (gtid >= 0) {
6257  __kmp_threads[gtid]->th.th_task_team = NULL;
6258  }
6259 
6260  KA_TRACE(10,
6261  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6262  gtid));
6263  return;
6264  }
6265  }
6266 #if KMP_DYNAMIC_LIB
6267  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6268  // uber thread; it is better to shut down later, in the library destructor.
6269  // The reason for this change is a performance problem when a non-OpenMP
6270  // thread forks and joins many OpenMP threads in a loop. We can save a lot of
6271  // time by keeping the worker threads alive until the program shuts down.
6272  // OM: Removed the Linux* OS restriction to fix the crashes on OS X*
6273  // (DPD200239966) and Windows* (DPD200287443) that occur when using critical
6274  // sections from foreign threads.
6275  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6276  return;
6277 #endif
6278  /* synchronize the termination process */
6279  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6280 
6281  /* have we already finished */
6282  if (__kmp_global.g.g_abort) {
6283  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6284  /* TODO abort? */
6285  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6286  return;
6287  }
6288  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6289  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6290  return;
6291  }
6292 
6293  /* We need this lock to enforce mutex between this reading of
6294  __kmp_threads_capacity and the writing by __kmp_register_root.
6295  Alternatively, we can use a counter of roots that is atomically updated by
6296  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6297  __kmp_internal_end_*. */
6298 
6299  /* should we finish the run-time? are all siblings done? */
6300  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6301 
6302  for (i = 0; i < __kmp_threads_capacity; ++i) {
6303  if (KMP_UBER_GTID(i)) {
6304  KA_TRACE(
6305  10,
6306  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6307  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6308  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6309  return;
6310  }
6311  }
6312 
6313  /* now we can safely conduct the actual termination */
6314 
6315  __kmp_internal_end();
6316 
6317  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6318  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6319 
6320  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6321 
6322 #ifdef DUMP_DEBUG_ON_EXIT
6323  if (__kmp_debug_buf)
6324  __kmp_dump_debug_buffer();
6325 #endif
6326 } // __kmp_internal_end_thread
6327 
6328 // -----------------------------------------------------------------------------
6329 // Library registration stuff.
6330 
6331 static long __kmp_registration_flag = 0;
6332 // Random value used to indicate library initialization.
6333 static char *__kmp_registration_str = NULL;
6334 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6335 
6336 static inline char *__kmp_reg_status_name() {
6337  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6338  each thread. If registration and unregistration happen in different threads
6339  (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6340  env var cannot be found, because the name will contain a different pid. */
6341  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6342 } // __kmp_reg_status_name
6343 
6344 void __kmp_register_library_startup(void) {
6345 
6346  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6347  int done = 0;
6348  union {
6349  double dtime;
6350  long ltime;
6351  } time;
6352 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6353  __kmp_initialize_system_tick();
6354 #endif
6355  __kmp_read_system_time(&time.dtime);
6356  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6357  __kmp_registration_str =
6358  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6359  __kmp_registration_flag, KMP_LIBRARY_FILE);
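  /* Illustrative example only (all values hypothetical): with pid 12345 the
     variable produced by __kmp_reg_status_name() is "__KMP_REGISTERED_LIB_12345"
     and the value built above has the form
         "<flag address>-<flag value>-<library file>"
     e.g. "0x7f3a12345678-cafe1234-libomp.so". Another copy of the runtime that
     finds this variable splits it on '-' below and dereferences the address to
     decide whether this copy is still alive. */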
6360 
6361  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6362  __kmp_registration_str));
6363 
6364  while (!done) {
6365 
6366  char *value = NULL; // Actual value of the environment variable.
6367 
6368  // Set the environment variable, but do not overwrite it if it already exists.
6369  __kmp_env_set(name, __kmp_registration_str, 0);
6370  // Check that the variable was actually written.
6371  value = __kmp_env_get(name);
6372  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6373 
6374  done = 1; // Ok, environment variable set successfully, exit the loop.
6375 
6376  } else {
6377 
6378  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6379  // Check whether it is alive or dead.
6380  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6381  char *tail = value;
6382  char *flag_addr_str = NULL;
6383  char *flag_val_str = NULL;
6384  char const *file_name = NULL;
6385  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6386  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6387  file_name = tail;
6388  if (tail != NULL) {
6389  long *flag_addr = 0;
6390  long flag_val = 0;
6391  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6392  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6393  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6394  // First, check whether environment-encoded address is mapped into
6395  // addr space.
6396  // If so, dereference it to see if it still has the right value.
6397  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6398  neighbor = 1;
6399  } else {
6400  // If not, then we know the other copy of the library is no longer
6401  // running.
6402  neighbor = 2;
6403  }
6404  }
6405  }
6406  switch (neighbor) {
6407  case 0: // Cannot parse environment variable -- neighbor status unknown.
6408  // Assume it is the incompatible format of a future version of the
6409  // library, and assume that the other library is alive.
6410  // WARN( ... ); // TODO: Issue a warning.
6411  file_name = "unknown library";
6412  // Attention! Falling through to the next case. That's intentional.
6413  case 1: { // Neighbor is alive.
6414  // Check whether duplicate libraries are allowed.
6415  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6416  if (!__kmp_str_match_true(duplicate_ok)) {
6417  // That's not allowed. Issue fatal error.
6418  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6419  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6420  }
6421  KMP_INTERNAL_FREE(duplicate_ok);
6422  __kmp_duplicate_library_ok = 1;
6423  done = 1; // Exit the loop.
6424  } break;
6425  case 2: { // Neighbor is dead.
6426  // Clear the variable and try to register library again.
6427  __kmp_env_unset(name);
6428  } break;
6429  default: { KMP_DEBUG_ASSERT(0); } break;
6430  }
6431  }
6432  KMP_INTERNAL_FREE((void *)value);
6433  }
6434  KMP_INTERNAL_FREE((void *)name);
6435 
6436 } // func __kmp_register_library_startup
6437 
6438 void __kmp_unregister_library(void) {
6439 
6440  char *name = __kmp_reg_status_name();
6441  char *value = __kmp_env_get(name);
6442 
6443  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6444  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6445  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6446  // Ok, this is our variable. Delete it.
6447  __kmp_env_unset(name);
6448  }
6449 
6450  KMP_INTERNAL_FREE(__kmp_registration_str);
6451  KMP_INTERNAL_FREE(value);
6452  KMP_INTERNAL_FREE(name);
6453 
6454  __kmp_registration_flag = 0;
6455  __kmp_registration_str = NULL;
6456 
6457 } // __kmp_unregister_library
6458 
6459 // End of Library registration stuff.
6460 // -----------------------------------------------------------------------------
6461 
6462 #if KMP_MIC_SUPPORTED
6463 
6464 static void __kmp_check_mic_type() {
6465  kmp_cpuid_t cpuid_state = {0};
6466  kmp_cpuid_t *cs_p = &cpuid_state;
6467  __kmp_x86_cpuid(1, 0, cs_p);
6468  // We don't support mic1 at the moment
6469  if ((cs_p->eax & 0xff0) == 0xB10) {
6470  __kmp_mic_type = mic2;
6471  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6472  __kmp_mic_type = mic3;
6473  } else {
6474  __kmp_mic_type = non_mic;
6475  }
6476 }
6477 
6478 #endif /* KMP_MIC_SUPPORTED */
6479 
6480 static void __kmp_do_serial_initialize(void) {
6481  int i, gtid;
6482  int size;
6483 
6484  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6485 
6486  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6487  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6488  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6489  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6490  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6491 
6492 #if OMPT_SUPPORT
6493  ompt_pre_init();
6494 #endif
6495 
6496  __kmp_validate_locks();
6497 
6498  /* Initialize internal memory allocator */
6499  __kmp_init_allocator();
6500 
6501  /* Register the library startup via an environment variable and check to see
6502  whether another copy of the library is already registered. */
6503 
6504  __kmp_register_library_startup();
6505 
6506  /* TODO reinitialization of library */
6507  if (TCR_4(__kmp_global.g.g_done)) {
6508  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6509  }
6510 
6511  __kmp_global.g.g_abort = 0;
6512  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6513 
6514 /* initialize the locks */
6515 #if KMP_USE_ADAPTIVE_LOCKS
6516 #if KMP_DEBUG_ADAPTIVE_LOCKS
6517  __kmp_init_speculative_stats();
6518 #endif
6519 #endif
6520 #if KMP_STATS_ENABLED
6521  __kmp_stats_init();
6522 #endif
6523  __kmp_init_lock(&__kmp_global_lock);
6524  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6525  __kmp_init_lock(&__kmp_debug_lock);
6526  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6527  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6528  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6529  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6530  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6531  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6532  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6533  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6534  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6535  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6536  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6537  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6538  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6539  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6540  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6541 #if KMP_USE_MONITOR
6542  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6543 #endif
6544  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6545 
6546  /* conduct initialization and initial setup of configuration */
6547 
6548  __kmp_runtime_initialize();
6549 
6550 #if KMP_MIC_SUPPORTED
6551  __kmp_check_mic_type();
6552 #endif
6553 
6554 // Some global variable initialization moved here from kmp_env_initialize()
6555 #ifdef KMP_DEBUG
6556  kmp_diag = 0;
6557 #endif
6558  __kmp_abort_delay = 0;
6559 
6560  // From __kmp_init_dflt_team_nth()
6561  /* assume the entire machine will be used */
6562  __kmp_dflt_team_nth_ub = __kmp_xproc;
6563  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6564  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6565  }
6566  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6567  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6568  }
6569  __kmp_max_nth = __kmp_sys_max_nth;
6570  __kmp_cg_max_nth = __kmp_sys_max_nth;
6571  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6572  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6573  __kmp_teams_max_nth = __kmp_sys_max_nth;
6574  }
6575 
6576  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6577  // part
6578  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6579 #if KMP_USE_MONITOR
6580  __kmp_monitor_wakeups =
6581  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6582  __kmp_bt_intervals =
6583  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6584 #endif
6585  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6586  __kmp_library = library_throughput;
6587  // From KMP_SCHEDULE initialization
6588  __kmp_static = kmp_sch_static_balanced;
6589 // AC: do not use analytical here, because it is non-monotonic
6590 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6591 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6592 // need to repeat assignment
6593 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6594 // bit control and barrier method control parts
6595 #if KMP_FAST_REDUCTION_BARRIER
6596 #define kmp_reduction_barrier_gather_bb ((int)1)
6597 #define kmp_reduction_barrier_release_bb ((int)1)
6598 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6599 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6600 #endif // KMP_FAST_REDUCTION_BARRIER
6601  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6602  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6603  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6604  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6605  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6606 #if KMP_FAST_REDUCTION_BARRIER
6607  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6608  // lin_64 ): hyper,1
6609  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6610  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6611  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6612  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6613  }
6614 #endif // KMP_FAST_REDUCTION_BARRIER
6615  }
6616 #if KMP_FAST_REDUCTION_BARRIER
6617 #undef kmp_reduction_barrier_release_pat
6618 #undef kmp_reduction_barrier_gather_pat
6619 #undef kmp_reduction_barrier_release_bb
6620 #undef kmp_reduction_barrier_gather_bb
6621 #endif // KMP_FAST_REDUCTION_BARRIER
6622 #if KMP_MIC_SUPPORTED
6623  if (__kmp_mic_type == mic2) { // KNC
6624  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6625  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6626  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6627  1; // forkjoin release
6628  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6629  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6630  }
6631 #if KMP_FAST_REDUCTION_BARRIER
6632  if (__kmp_mic_type == mic2) { // KNC
6633  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6634  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6635  }
6636 #endif // KMP_FAST_REDUCTION_BARRIER
6637 #endif // KMP_MIC_SUPPORTED
6638 
6639 // From KMP_CHECKS initialization
6640 #ifdef KMP_DEBUG
6641  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6642 #else
6643  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6644 #endif
6645 
6646  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6647  __kmp_foreign_tp = TRUE;
6648 
6649  __kmp_global.g.g_dynamic = FALSE;
6650  __kmp_global.g.g_dynamic_mode = dynamic_default;
6651 
6652  __kmp_env_initialize(NULL);
6653 
6654 // Print all messages in message catalog for testing purposes.
6655 #ifdef KMP_DEBUG
6656  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6657  if (__kmp_str_match_true(val)) {
6658  kmp_str_buf_t buffer;
6659  __kmp_str_buf_init(&buffer);
6660  __kmp_i18n_dump_catalog(&buffer);
6661  __kmp_printf("%s", buffer.str);
6662  __kmp_str_buf_free(&buffer);
6663  }
6664  __kmp_env_free(&val);
6665 #endif
6666 
6667  __kmp_threads_capacity =
6668  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6669  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6670  __kmp_tp_capacity = __kmp_default_tp_capacity(
6671  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6672 
6673  // If the library is shut down properly, both pools must be NULL. Just in
6674  // case, set them to NULL -- some memory may leak, but subsequent code will
6675  // work even if pools are not freed.
6676  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6677  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6678  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6679  __kmp_thread_pool = NULL;
6680  __kmp_thread_pool_insert_pt = NULL;
6681  __kmp_team_pool = NULL;
6682 
6683  /* Allocate all of the variable sized records */
6684  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6685  * expandable */
6686  /* Since allocation is cache-aligned, just add extra padding at the end */
6687  size =
6688  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6689  CACHE_LINE;
6690  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6691  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6692  sizeof(kmp_info_t *) * __kmp_threads_capacity);
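  /* Layout of the single cache-aligned allocation above (illustrative):
         __kmp_threads[0 .. __kmp_threads_capacity-1]  (kmp_info_t * slots)
         __kmp_root[0 .. __kmp_threads_capacity-1]     (kmp_root_t * slots)
         CACHE_LINE bytes of padding
     This is why __kmp_cleanup() frees only __kmp_threads and never
     __kmp_root. */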
6693 
6694  /* init thread counts */
6695  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6696  0); // Asserts fail if the library is reinitializing and
6697  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6698  __kmp_all_nth = 0;
6699  __kmp_nth = 0;
6700 
6701  /* setup the uber master thread and hierarchy */
6702  gtid = __kmp_register_root(TRUE);
6703  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6704  KMP_ASSERT(KMP_UBER_GTID(gtid));
6705  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6706 
6707  KMP_MB(); /* Flush all pending memory write invalidates. */
6708 
6709  __kmp_common_initialize();
6710 
6711 #if KMP_OS_UNIX
6712  /* invoke the child fork handler */
6713  __kmp_register_atfork();
6714 #endif
6715 
6716 #if !KMP_DYNAMIC_LIB
6717  {
6718  /* Invoke the exit handler when the program finishes, only for the static
6719  library. For a dynamic library, we already have _fini and DllMain. */
6720  int rc = atexit(__kmp_internal_end_atexit);
6721  if (rc != 0) {
6722  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6723  __kmp_msg_null);
6724  }
6725  }
6726 #endif
6727 
6728 #if KMP_HANDLE_SIGNALS
6729 #if KMP_OS_UNIX
6730  /* NOTE: make sure that this is called before the user installs their own
6731  signal handlers so that the user handlers are called first. this way they
6732  can return false, not call our handler, avoid terminating the library, and
6733  continue execution where they left off. */
6734  __kmp_install_signals(FALSE);
6735 #endif /* KMP_OS_UNIX */
6736 #if KMP_OS_WINDOWS
6737  __kmp_install_signals(TRUE);
6738 #endif /* KMP_OS_WINDOWS */
6739 #endif
6740 
6741  /* we have finished the serial initialization */
6742  __kmp_init_counter++;
6743 
6744  __kmp_init_serial = TRUE;
6745 
6746  if (__kmp_settings) {
6747  __kmp_env_print();
6748  }
6749 
6750 #if OMP_40_ENABLED
6751  if (__kmp_display_env || __kmp_display_env_verbose) {
6752  __kmp_env_print_2();
6753  }
6754 #endif // OMP_40_ENABLED
6755 
6756 #if OMPT_SUPPORT
6757  ompt_post_init();
6758 #endif
6759 
6760  KMP_MB();
6761 
6762  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6763 }
6764 
6765 void __kmp_serial_initialize(void) {
6766  if (__kmp_init_serial) {
6767  return;
6768  }
6769  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6770  if (__kmp_init_serial) {
6771  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6772  return;
6773  }
6774  __kmp_do_serial_initialize();
6775  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6776 }
6777 
6778 static void __kmp_do_middle_initialize(void) {
6779  int i, j;
6780  int prev_dflt_team_nth;
6781 
6782  if (!__kmp_init_serial) {
6783  __kmp_do_serial_initialize();
6784  }
6785 
6786  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6787 
6788  // Save the previous value for the __kmp_dflt_team_nth so that
6789  // we can avoid some reinitialization if it hasn't changed.
6790  prev_dflt_team_nth = __kmp_dflt_team_nth;
6791 
6792 #if KMP_AFFINITY_SUPPORTED
6793  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6794  // number of cores on the machine.
6795  __kmp_affinity_initialize();
6796 
6797  // Run through the __kmp_threads array and set the affinity mask
6798  // for each root thread that is currently registered with the RTL.
6799  for (i = 0; i < __kmp_threads_capacity; i++) {
6800  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6801  __kmp_affinity_set_init_mask(i, TRUE);
6802  }
6803  }
6804 #endif /* KMP_AFFINITY_SUPPORTED */
6805 
6806  KMP_ASSERT(__kmp_xproc > 0);
6807  if (__kmp_avail_proc == 0) {
6808  __kmp_avail_proc = __kmp_xproc;
6809  }
6810 
6811  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6812  // correct them now
6813  j = 0;
6814  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6815  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6816  __kmp_avail_proc;
6817  j++;
6818  }
6819 
6820  if (__kmp_dflt_team_nth == 0) {
6821 #ifdef KMP_DFLT_NTH_CORES
6822  // Default #threads = #cores
6823  __kmp_dflt_team_nth = __kmp_ncores;
6824  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6825  "__kmp_ncores (%d)\n",
6826  __kmp_dflt_team_nth));
6827 #else
6828  // Default #threads = #available OS procs
6829  __kmp_dflt_team_nth = __kmp_avail_proc;
6830  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6831  "__kmp_avail_proc(%d)\n",
6832  __kmp_dflt_team_nth));
6833 #endif /* KMP_DFLT_NTH_CORES */
6834  }
6835 
6836  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6837  __kmp_dflt_team_nth = KMP_MIN_NTH;
6838  }
6839  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6840  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6841  }
6842 
6843  // There's no harm in continuing if the following check fails,
6844  // but it indicates an error in the previous logic.
6845  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6846 
6847  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6848  // Run through the __kmp_threads array and set the num threads icv for each
6849  // root thread that is currently registered with the RTL (which has not
6850  // already explicitly set its nthreads-var with a call to
6851  // omp_set_num_threads()).
6852  for (i = 0; i < __kmp_threads_capacity; i++) {
6853  kmp_info_t *thread = __kmp_threads[i];
6854  if (thread == NULL)
6855  continue;
6856  if (thread->th.th_current_task->td_icvs.nproc != 0)
6857  continue;
6858 
6859  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6860  }
6861  }
6862  KA_TRACE(
6863  20,
6864  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6865  __kmp_dflt_team_nth));
6866 
6867 #ifdef KMP_ADJUST_BLOCKTIME
6868  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6869  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6870  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6871  if (__kmp_nth > __kmp_avail_proc) {
6872  __kmp_zero_bt = TRUE;
6873  }
6874  }
6875 #endif /* KMP_ADJUST_BLOCKTIME */
6876 
6877  /* we have finished middle initialization */
6878  TCW_SYNC_4(__kmp_init_middle, TRUE);
6879 
6880  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6881 }
6882 
6883 void __kmp_middle_initialize(void) {
6884  if (__kmp_init_middle) {
6885  return;
6886  }
6887  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6888  if (__kmp_init_middle) {
6889  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6890  return;
6891  }
6892  __kmp_do_middle_initialize();
6893  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6894 }
6895 
6896 void __kmp_parallel_initialize(void) {
6897  int gtid = __kmp_entry_gtid(); // this might be a new root
6898 
6899  /* synchronize parallel initialization (for sibling) */
6900  if (TCR_4(__kmp_init_parallel))
6901  return;
6902  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6903  if (TCR_4(__kmp_init_parallel)) {
6904  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6905  return;
6906  }
6907 
6908  /* TODO reinitialization after we have already shut down */
6909  if (TCR_4(__kmp_global.g.g_done)) {
6910  KA_TRACE(
6911  10,
6912  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6913  __kmp_infinite_loop();
6914  }
6915 
6916  /* jc: The lock __kmp_initz_lock is already held, so calling
6917  __kmp_serial_initialize would cause a deadlock. So we call
6918  __kmp_do_serial_initialize directly. */
6919  if (!__kmp_init_middle) {
6920  __kmp_do_middle_initialize();
6921  }
6922 
6923  /* begin initialization */
6924  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6925  KMP_ASSERT(KMP_UBER_GTID(gtid));
6926 
6927 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6928  // Save the FP control regs.
6929  // Worker threads will set theirs to these values at thread startup.
6930  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6931  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6932  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6933 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6934 
6935 #if KMP_OS_UNIX
6936 #if KMP_HANDLE_SIGNALS
6937  /* must be after __kmp_serial_initialize */
6938  __kmp_install_signals(TRUE);
6939 #endif
6940 #endif
6941 
6942  __kmp_suspend_initialize();
6943 
6944 #if defined(USE_LOAD_BALANCE)
6945  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6946  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6947  }
6948 #else
6949  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6950  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6951  }
6952 #endif
6953 
6954  if (__kmp_version) {
6955  __kmp_print_version_2();
6956  }
6957 
6958  /* we have finished parallel initialization */
6959  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6960 
6961  KMP_MB();
6962  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6963 
6964  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6965 }
6966 
6967 /* ------------------------------------------------------------------------ */
6968 
6969 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6970  kmp_team_t *team) {
6971  kmp_disp_t *dispatch;
6972 
6973  KMP_MB();
6974 
6975  /* none of the threads have encountered any constructs, yet. */
6976  this_thr->th.th_local.this_construct = 0;
6977 #if KMP_CACHE_MANAGE
6978  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6979 #endif /* KMP_CACHE_MANAGE */
6980  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6981  KMP_DEBUG_ASSERT(dispatch);
6982  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6983  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6984  // this_thr->th.th_info.ds.ds_tid ] );
6985 
6986  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6987 #if OMP_45_ENABLED
6988  dispatch->th_doacross_buf_idx =
6989  0; /* reset the doacross dispatch buffer counter */
6990 #endif
6991  if (__kmp_env_consistency_check)
6992  __kmp_push_parallel(gtid, team->t.t_ident);
6993 
6994  KMP_MB(); /* Flush all pending memory write invalidates. */
6995 }
6996 
6997 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6998  kmp_team_t *team) {
6999  if (__kmp_env_consistency_check)
7000  __kmp_pop_parallel(gtid, team->t.t_ident);
7001 
7002  __kmp_finish_implicit_task(this_thr);
7003 }
7004 
7005 int __kmp_invoke_task_func(int gtid) {
7006  int rc;
7007  int tid = __kmp_tid_from_gtid(gtid);
7008  kmp_info_t *this_thr = __kmp_threads[gtid];
7009  kmp_team_t *team = this_thr->th.th_team;
7010 
7011  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7012 #if USE_ITT_BUILD
7013  if (__itt_stack_caller_create_ptr) {
7014  __kmp_itt_stack_callee_enter(
7015  (__itt_caller)
7016  team->t.t_stack_id); // inform ittnotify about entering user's code
7017  }
7018 #endif /* USE_ITT_BUILD */
7019 #if INCLUDE_SSC_MARKS
7020  SSC_MARK_INVOKING();
7021 #endif
7022 
7023 #if OMPT_SUPPORT
7024  void *dummy;
7025  void **exit_runtime_p;
7026  ompt_data_t *my_task_data;
7027  ompt_data_t *my_parallel_data;
7028  int ompt_team_size;
7029 
7030  if (ompt_enabled.enabled) {
7031  exit_runtime_p = &(
7032  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7033  } else {
7034  exit_runtime_p = &dummy;
7035  }
7036 
7037  my_task_data =
7038  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7039  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7040  if (ompt_enabled.ompt_callback_implicit_task) {
7041  ompt_team_size = team->t.t_nproc;
7042  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7043  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7044  __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7045  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7046  }
7047 #endif
7048 
7049  {
7050  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
7051  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
7052  rc =
7053  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7054  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7055 #if OMPT_SUPPORT
7056  ,
7057  exit_runtime_p
7058 #endif
7059  );
7060 #if OMPT_SUPPORT
7061  *exit_runtime_p = NULL;
7062 #endif
7063  }
7064 
7065 #if USE_ITT_BUILD
7066  if (__itt_stack_caller_create_ptr) {
7067  __kmp_itt_stack_callee_leave(
7068  (__itt_caller)
7069  team->t.t_stack_id); // inform ittnotify about leaving user's code
7070  }
7071 #endif /* USE_ITT_BUILD */
7072  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7073 
7074  return rc;
7075 }
7076 
7077 #if OMP_40_ENABLED
7078 void __kmp_teams_master(int gtid) {
7079  // This routine is called by all master threads in teams construct
7080  kmp_info_t *thr = __kmp_threads[gtid];
7081  kmp_team_t *team = thr->th.th_team;
7082  ident_t *loc = team->t.t_ident;
7083  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7084  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7085  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7086  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7087  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7088 // Launch the league of teams now, but do not let the workers execute
7089 // (they hang on the fork barrier until the next parallel region)
7090 #if INCLUDE_SSC_MARKS
7091  SSC_MARK_FORKING();
7092 #endif
7093  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7094  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7095  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7096 #if INCLUDE_SSC_MARKS
7097  SSC_MARK_JOINING();
7098 #endif
7099 
7100  // AC: last parameter "1" eliminates join barrier which won't work because
7101  // worker threads are in a fork barrier waiting for more parallel regions
7102  __kmp_join_call(loc, gtid
7103 #if OMPT_SUPPORT
7104  ,
7105  fork_context_intel
7106 #endif
7107  ,
7108  1);
7109 }
7110 
7111 int __kmp_invoke_teams_master(int gtid) {
7112  kmp_info_t *this_thr = __kmp_threads[gtid];
7113  kmp_team_t *team = this_thr->th.th_team;
7114 #if KMP_DEBUG
7115  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7116  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7117  (void *)__kmp_teams_master);
7118 #endif
7119  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7120  __kmp_teams_master(gtid);
7121  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7122  return 1;
7123 }
7124 #endif /* OMP_40_ENABLED */
7125 
7126 /* This sets the requested number of threads for the next parallel region
7127  encountered by this team. Since this should be enclosed in the forkjoin
7128  critical section, it should avoid race conditions with asymmetrical nested
7129  parallelism. */
7130 
7131 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7132  kmp_info_t *thr = __kmp_threads[gtid];
7133 
7134  if (num_threads > 0)
7135  thr->th.th_set_nproc = num_threads;
7136 }
7137 
7138 #if OMP_40_ENABLED
7139 
7140 /* this sets the requested number of teams for the teams region and/or
7141  the number of threads for the next parallel region encountered */
7142 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7143  int num_threads) {
7144  kmp_info_t *thr = __kmp_threads[gtid];
7145  KMP_DEBUG_ASSERT(num_teams >= 0);
7146  KMP_DEBUG_ASSERT(num_threads >= 0);
7147 
7148  if (num_teams == 0)
7149  num_teams = 1; // default number of teams is 1.
7150  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7151  if (!__kmp_reserve_warn) {
7152  __kmp_reserve_warn = 1;
7153  __kmp_msg(kmp_ms_warning,
7154  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7155  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7156  }
7157  num_teams = __kmp_teams_max_nth;
7158  }
7159  // Set number of teams (number of threads in the outer "parallel" of the
7160  // teams)
7161  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7162 
7163  // Remember the number of threads for inner parallel regions
7164  if (num_threads == 0) {
7165  if (!TCR_4(__kmp_init_middle))
7166  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7167  num_threads = __kmp_avail_proc / num_teams;
7168  if (num_teams * num_threads > __kmp_teams_max_nth) {
7169  // adjust num_threads w/o warning as it is not a user setting
7170  num_threads = __kmp_teams_max_nth / num_teams;
7171  }
7172  } else {
7173  if (num_teams * num_threads > __kmp_teams_max_nth) {
7174  int new_threads = __kmp_teams_max_nth / num_teams;
7175  if (!__kmp_reserve_warn) { // user asked for too many threads
7176  __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7177  __kmp_msg(kmp_ms_warning,
7178  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7179  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7180  }
7181  num_threads = new_threads;
7182  }
7183  }
7184  thr->th.th_teams_size.nth = num_threads;
7185 }
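// Worked example (hypothetical values): with num_teams = 4, num_threads = 0,
// __kmp_avail_proc = 32 and __kmp_teams_max_nth = 16, the code above first
// computes num_threads = 32 / 4 = 8; since 4 * 8 = 32 > 16, it is silently
// reduced to 16 / 4 = 4. Had the user requested num_threads = 10 instead,
// 4 * 10 = 40 > 16 would trigger the CantFormThrTeam warning and num_threads
// would likewise be clamped to 16 / 4 = 4.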
7186 
7187 // Set the proc_bind var to use in the following parallel region.
7188 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7189  kmp_info_t *thr = __kmp_threads[gtid];
7190  thr->th.th_set_proc_bind = proc_bind;
7191 }
7192 
7193 #endif /* OMP_40_ENABLED */
7194 
7195 /* Launch the worker threads into the microtask. */
7196 
7197 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7198  kmp_info_t *this_thr = __kmp_threads[gtid];
7199 
7200 #ifdef KMP_DEBUG
7201  int f;
7202 #endif /* KMP_DEBUG */
7203 
7204  KMP_DEBUG_ASSERT(team);
7205  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7206  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7207  KMP_MB(); /* Flush all pending memory write invalidates. */
7208 
7209  team->t.t_construct = 0; /* no single directives seen yet */
7210  team->t.t_ordered.dt.t_value =
7211  0; /* thread 0 enters the ordered section first */
7212 
7213  /* Reset the identifiers on the dispatch buffer */
7214  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7215  if (team->t.t_max_nproc > 1) {
7216  int i;
7217  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7218  team->t.t_disp_buffer[i].buffer_index = i;
7219 #if OMP_45_ENABLED
7220  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7221 #endif
7222  }
7223  } else {
7224  team->t.t_disp_buffer[0].buffer_index = 0;
7225 #if OMP_45_ENABLED
7226  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7227 #endif
7228  }
7229 
7230  KMP_MB(); /* Flush all pending memory write invalidates. */
7231  KMP_ASSERT(this_thr->th.th_team == team);
7232 
7233 #ifdef KMP_DEBUG
7234  for (f = 0; f < team->t.t_nproc; f++) {
7235  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7236  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7237  }
7238 #endif /* KMP_DEBUG */
7239 
7240  /* release the worker threads so they may begin working */
7241  __kmp_fork_barrier(gtid, 0);
7242 }
7243 
7244 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7245  kmp_info_t *this_thr = __kmp_threads[gtid];
7246 
7247  KMP_DEBUG_ASSERT(team);
7248  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7249  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7250  KMP_MB(); /* Flush all pending memory write invalidates. */
7251 
7252 /* Join barrier after fork */
7253 
7254 #ifdef KMP_DEBUG
7255  if (__kmp_threads[gtid] &&
7256  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7257  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7258  __kmp_threads[gtid]);
7259  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7260  "team->t.t_nproc=%d\n",
7261  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7262  team->t.t_nproc);
7263  __kmp_print_structure();
7264  }
7265  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7266  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7267 #endif /* KMP_DEBUG */
7268 
7269  __kmp_join_barrier(gtid); /* wait for everyone */
7270 #if OMPT_SUPPORT
7271  if (ompt_enabled.enabled &&
7272  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7273  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7274  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7275  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7276 #if OMPT_OPTIONAL
7277  void *codeptr = NULL;
7278  if (KMP_MASTER_TID(ds_tid) &&
7279  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7280  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7281  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7282 
7283  if (ompt_enabled.ompt_callback_sync_region_wait) {
7284  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7285  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7286  }
7287  if (ompt_enabled.ompt_callback_sync_region) {
7288  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7289  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7290  }
7291 #endif
7292  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7293  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7294  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7295  }
7296  }
7297 #endif
7298 
7299  KMP_MB(); /* Flush all pending memory write invalidates. */
7300  KMP_ASSERT(this_thr->th.th_team == team);
7301 }
7302 
7303 /* ------------------------------------------------------------------------ */
7304 
7305 #ifdef USE_LOAD_BALANCE
7306 
7307 // Return the number of worker threads actively spinning in the hot team,
7308 // if we are at the outermost level of parallelism. Otherwise, return 0.
7309 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7310  int i;
7311  int retval;
7312  kmp_team_t *hot_team;
7313 
7314  if (root->r.r_active) {
7315  return 0;
7316  }
7317  hot_team = root->r.r_hot_team;
7318  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7319  return hot_team->t.t_nproc - 1; // Don't count master thread
7320  }
7321 
7322  // Skip the master thread - it is accounted for elsewhere.
7323  retval = 0;
7324  for (i = 1; i < hot_team->t.t_nproc; i++) {
7325  if (hot_team->t.t_threads[i]->th.th_active) {
7326  retval++;
7327  }
7328  }
7329  return retval;
7330 }
7331 
7332 // Perform an automatic adjustment to the number of
7333 // threads used by the next parallel region.
7334 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7335  int retval;
7336  int pool_active;
7337  int hot_team_active;
7338  int team_curr_active;
7339  int system_active;
7340 
7341  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7342  set_nproc));
7343  KMP_DEBUG_ASSERT(root);
7344  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7345  ->th.th_current_task->td_icvs.dynamic == TRUE);
7346  KMP_DEBUG_ASSERT(set_nproc > 1);
7347 
7348  if (set_nproc == 1) {
7349  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7350  return 1;
7351  }
7352 
7353  // Threads that are active in the thread pool, active in the hot team for this
7354  // particular root (if we are at the outer par level), and the currently
7355  // executing thread (to become the master) are available to add to the new
7356  // team, but are currently contributing to the system load, and must be
7357  // accounted for.
7358  pool_active = __kmp_thread_pool_active_nth;
7359  hot_team_active = __kmp_active_hot_team_nproc(root);
7360  team_curr_active = pool_active + hot_team_active + 1;
7361 
7362  // Check the system load.
7363  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7364  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7365  "hot team active = %d\n",
7366  system_active, pool_active, hot_team_active));
7367 
7368  if (system_active < 0) {
7369  // There was an error reading the necessary info from /proc, so use the
7370  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7371  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7372  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7373  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7374 
7375  // Make this call behave like the thread limit algorithm.
7376  retval = __kmp_avail_proc - __kmp_nth +
7377  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7378  if (retval > set_nproc) {
7379  retval = set_nproc;
7380  }
7381  if (retval < KMP_MIN_NTH) {
7382  retval = KMP_MIN_NTH;
7383  }
7384 
7385  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7386  retval));
7387  return retval;
7388  }
7389 
7390  // There is a slight delay in the load balance algorithm in detecting new
7391  // running procs. The real system load at this instant should be at least as
7392  // large as the #active omp threads that are available to add to the team.
7393  if (system_active < team_curr_active) {
7394  system_active = team_curr_active;
7395  }
7396  retval = __kmp_avail_proc - system_active + team_curr_active;
7397  if (retval > set_nproc) {
7398  retval = set_nproc;
7399  }
7400  if (retval < KMP_MIN_NTH) {
7401  retval = KMP_MIN_NTH;
7402  }
7403 
7404  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7405  return retval;
7406 } // __kmp_load_balance_nproc()
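// Worked example (hypothetical values): __kmp_avail_proc = 16, three threads
// idle in the thread pool and four workers active in the hot team give
// team_curr_active = 3 + 4 + 1 = 8. If __kmp_get_load_balance() reports
// system_active = 12, the routine above returns 16 - 12 + 8 = 12, further
// clamped to set_nproc on the high side and KMP_MIN_NTH on the low side.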
7407 
7408 #endif /* USE_LOAD_BALANCE */
7409 
7410 /* ------------------------------------------------------------------------ */
7411 
7412 /* NOTE: this is called with the __kmp_init_lock held */
7413 void __kmp_cleanup(void) {
7414  int f;
7415 
7416  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7417 
7418  if (TCR_4(__kmp_init_parallel)) {
7419 #if KMP_HANDLE_SIGNALS
7420  __kmp_remove_signals();
7421 #endif
7422  TCW_4(__kmp_init_parallel, FALSE);
7423  }
7424 
7425  if (TCR_4(__kmp_init_middle)) {
7426 #if KMP_AFFINITY_SUPPORTED
7427  __kmp_affinity_uninitialize();
7428 #endif /* KMP_AFFINITY_SUPPORTED */
7429  __kmp_cleanup_hierarchy();
7430  TCW_4(__kmp_init_middle, FALSE);
7431  }
7432 
7433  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7434 
7435  if (__kmp_init_serial) {
7436  __kmp_runtime_destroy();
7437  __kmp_init_serial = FALSE;
7438  }
7439 
7440  __kmp_cleanup_threadprivate_caches();
7441 
7442  for (f = 0; f < __kmp_threads_capacity; f++) {
7443  if (__kmp_root[f] != NULL) {
7444  __kmp_free(__kmp_root[f]);
7445  __kmp_root[f] = NULL;
7446  }
7447  }
7448  __kmp_free(__kmp_threads);
7449  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7450  // there is no need to free __kmp_root separately.
7451  __kmp_threads = NULL;
7452  __kmp_root = NULL;
7453  __kmp_threads_capacity = 0;
7454 
7455 #if KMP_USE_DYNAMIC_LOCK
7456  __kmp_cleanup_indirect_user_locks();
7457 #else
7458  __kmp_cleanup_user_locks();
7459 #endif
7460 
7461 #if KMP_AFFINITY_SUPPORTED
7462  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7463  __kmp_cpuinfo_file = NULL;
7464 #endif /* KMP_AFFINITY_SUPPORTED */
7465 
7466 #if KMP_USE_ADAPTIVE_LOCKS
7467 #if KMP_DEBUG_ADAPTIVE_LOCKS
7468  __kmp_print_speculative_stats();
7469 #endif
7470 #endif
7471  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7472  __kmp_nested_nth.nth = NULL;
7473  __kmp_nested_nth.size = 0;
7474  __kmp_nested_nth.used = 0;
7475  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7476  __kmp_nested_proc_bind.bind_types = NULL;
7477  __kmp_nested_proc_bind.size = 0;
7478  __kmp_nested_proc_bind.used = 0;
7479 #if OMP_50_ENABLED
7480  if (__kmp_affinity_format) {
7481  KMP_INTERNAL_FREE(__kmp_affinity_format);
7482  __kmp_affinity_format = NULL;
7483  }
7484 #endif
7485 
7486  __kmp_i18n_catclose();
7487 
7488 #if KMP_USE_HIER_SCHED
7489  __kmp_hier_scheds.deallocate();
7490 #endif
7491 
7492 #if KMP_STATS_ENABLED
7493  __kmp_stats_fini();
7494 #endif
7495 
7496  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7497 }
7498 
7499 /* ------------------------------------------------------------------------ */
7500 
7501 int __kmp_ignore_mppbeg(void) {
7502  char *env;
7503 
7504  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7505  if (__kmp_str_match_false(env))
7506  return FALSE;
7507  }
7508  // By default __kmpc_begin() is a no-op.
7509  return TRUE;
7510 }
7511 
7512 int __kmp_ignore_mppend(void) {
7513  char *env;
7514 
7515  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7516  if (__kmp_str_match_false(env))
7517  return FALSE;
7518  }
7519  // By default __kmpc_end() is a no-op.
7520  return TRUE;
7521 }
7522 
7523 void __kmp_internal_begin(void) {
7524  int gtid;
7525  kmp_root_t *root;
7526 
7527  /* this is a very important step as it will register new sibling threads
7528  and assign these new uber threads a new gtid */
7529  gtid = __kmp_entry_gtid();
7530  root = __kmp_threads[gtid]->th.th_root;
7531  KMP_ASSERT(KMP_UBER_GTID(gtid));
7532 
7533  if (root->r.r_begin)
7534  return;
7535  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7536  if (root->r.r_begin) {
7537  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7538  return;
7539  }
7540 
7541  root->r.r_begin = TRUE;
7542 
7543  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7544 }
7545 
7546 /* ------------------------------------------------------------------------ */
7547 
7548 void __kmp_user_set_library(enum library_type arg) {
7549  int gtid;
7550  kmp_root_t *root;
7551  kmp_info_t *thread;
7552 
7553  /* first, make sure we are initialized so we can get our gtid */
7554 
7555  gtid = __kmp_entry_gtid();
7556  thread = __kmp_threads[gtid];
7557 
7558  root = thread->th.th_root;
7559 
7560  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7561  library_serial));
7562  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7563  thread */
7564  KMP_WARNING(SetLibraryIncorrectCall);
7565  return;
7566  }
7567 
7568  switch (arg) {
7569  case library_serial:
7570  thread->th.th_set_nproc = 0;
7571  set__nproc(thread, 1);
7572  break;
7573  case library_turnaround:
7574  thread->th.th_set_nproc = 0;
7575  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7576  : __kmp_dflt_team_nth_ub);
7577  break;
7578  case library_throughput:
7579  thread->th.th_set_nproc = 0;
7580  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7581  : __kmp_dflt_team_nth_ub);
7582  break;
7583  default:
7584  KMP_FATAL(UnknownLibraryType, arg);
7585  }
7586 
7587  __kmp_aux_set_library(arg);
7588 }
7589 
7590 void __kmp_aux_set_stacksize(size_t arg) {
7591  if (!__kmp_init_serial)
7592  __kmp_serial_initialize();
7593 
7594 #if KMP_OS_DARWIN
7595  if (arg & (0x1000 - 1)) {
7596  arg &= ~(0x1000 - 1);
7597  if (arg + 0x1000) /* check for overflow if we round up */
7598  arg += 0x1000;
7599  }
7600 #endif
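  /* Worked example for the OS X* rounding above (hypothetical request):
     arg = 0x80123 is not a multiple of 0x1000, so it is rounded down to
     0x80000 and then bumped up by one 4 KiB page to 0x81000 before the
     range checks below are applied. */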
7601  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7602 
7603  /* only change the default stacksize before the first parallel region */
7604  if (!TCR_4(__kmp_init_parallel)) {
7605  size_t value = arg; /* argument is in bytes */
7606 
7607  if (value < __kmp_sys_min_stksize)
7608  value = __kmp_sys_min_stksize;
7609  else if (value > KMP_MAX_STKSIZE)
7610  value = KMP_MAX_STKSIZE;
7611 
7612  __kmp_stksize = value;
7613 
7614  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7615  }
7616 
7617  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7618 }
7619 
7620 /* set the behaviour of the runtime library */
7621 /* TODO this can cause some odd behaviour with sibling parallelism... */
7622 void __kmp_aux_set_library(enum library_type arg) {
7623  __kmp_library = arg;
7624 
7625  switch (__kmp_library) {
7626  case library_serial: {
7627  KMP_INFORM(LibraryIsSerial);
7628  (void)__kmp_change_library(TRUE);
7629  } break;
7630  case library_turnaround:
7631  (void)__kmp_change_library(TRUE);
7632  break;
7633  case library_throughput:
7634  (void)__kmp_change_library(FALSE);
7635  break;
7636  default:
7637  KMP_FATAL(UnknownLibraryType, arg);
7638  }
7639 }
7640 
7641 /* Getting team information common for all team API */
7642 // Returns NULL if not in teams construct
7643 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7644  kmp_info_t *thr = __kmp_entry_thread();
7645  teams_serialized = 0;
7646  if (thr->th.th_teams_microtask) {
7647  kmp_team_t *team = thr->th.th_team;
7648  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7649  int ii = team->t.t_level;
7650  teams_serialized = team->t.t_serialized;
7651  int level = tlevel + 1;
7652  KMP_DEBUG_ASSERT(ii >= tlevel);
7653  while (ii > level) {
7654  for (teams_serialized = team->t.t_serialized;
7655  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7656  }
7657  if (team->t.t_serialized && (!teams_serialized)) {
7658  team = team->t.t_parent;
7659  continue;
7660  }
7661  if (ii > level) {
7662  team = team->t.t_parent;
7663  ii--;
7664  }
7665  }
7666  return team;
7667  }
7668  return NULL;
7669 }
7670 
7671 int __kmp_aux_get_team_num() {
7672  int serialized;
7673  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7674  if (team) {
7675  if (serialized > 1) {
7676  return 0; // teams region is serialized ( 1 team of 1 thread ).
7677  } else {
7678  return team->t.t_master_tid;
7679  }
7680  }
7681  return 0;
7682 }
7683 
7684 int __kmp_aux_get_num_teams() {
7685  int serialized;
7686  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7687  if (team) {
7688  if (serialized > 1) {
7689  return 1;
7690  } else {
7691  return team->t.t_parent->t.t_nproc;
7692  }
7693  }
7694  return 1;
7695 }
7696 
7697 /* ------------------------------------------------------------------------ */
7698 
7699 #if OMP_50_ENABLED
7700 /*
7701  * Affinity Format Parser
7702  *
7703  * Field is in form of: %[[[0].]size]type
7704  * % and type are required (%% means print a literal '%')
7705  * type is either single char or long name surrounded by {},
7706  * e.g., N or {num_threads}
7707  * 0 => leading zeros
7708  * . => right justified when size is specified
7709  * by default output is left justified
7710  * size is the *minimum* field length
7711  * All other characters are printed as is
7712  *
7713  * Available field types:
7714  * L {thread_level} - omp_get_level()
7715  * n {thread_num} - omp_get_thread_num()
7716  * H {host} - name of host machine
7717  * P {process_id} - process id (integer)
7718  * i {native_thread_id} - native thread identifier (integer)
7719  * N {num_threads} - omp_get_num_threads()
7720  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
7721  * A {thread_affinity} - comma separated list of integers or integer ranges
7722  * (values of affinity mask)
7723  *
7724  * Implementation-specific field types can be added
7725  * If a type is unknown, print "undefined"
7726 */
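/* Illustrative user-side sketch (guarded out; not part of the runtime). It
   assumes an OpenMP 5.0 compiler and shows how a format like the one parsed
   below typically reaches this code through omp_set_affinity_format() /
   omp_capture_affinity(). The host name "node001", pid 4711 and the printed
   values are hypothetical. Note that the field table that follows also
   accepts t {team_num} and T {num_teams}. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  // Field letters below match __kmp_affinity_format_table.
  omp_set_affinity_format("host=%H pid=%P thread %0.4n of %N");
#pragma omp parallel
  {
    char buf[128];
    // A NULL (or empty) format means "use the affinity-format-var ICV".
    omp_capture_affinity(buf, sizeof(buf), NULL);
    printf("%s\n", buf); // e.g. "host=node001 pid=4711 thread 0002 of 8"
  }
  return 0;
}
#endif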
7727 
7728 // Structure holding the short name, long name, and corresponding data type
7729 // for snprintf. A table of these represents the entire set of valid keyword
7730 // field types.
7731 typedef struct kmp_affinity_format_field_t {
7732  char short_name; // from spec e.g., L -> thread level
7733  const char *long_name; // from spec thread_level -> thread level
7734  char field_format; // data type for snprintf (typically 'd' or 's'
7735  // for integer or string)
7736 } kmp_affinity_format_field_t;
7737 
7738 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7739 #if KMP_AFFINITY_SUPPORTED
7740  {'A', "thread_affinity", 's'},
7741 #endif
7742  {'t', "team_num", 'd'},
7743  {'T', "num_teams", 'd'},
7744  {'L', "nesting_level", 'd'},
7745  {'n', "thread_num", 'd'},
7746  {'N', "num_threads", 'd'},
7747  {'a', "ancestor_tnum", 'd'},
7748  {'H', "host", 's'},
7749  {'P', "process_id", 'd'},
7750  {'i', "native_thread_id", 'd'}};
7751 
7752 // Return the number of characters it takes to hold the field
7753 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7754  const char **ptr,
7755  kmp_str_buf_t *field_buffer) {
7756  int rc, format_index, field_value;
7757  const char *width_left, *width_right;
7758  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7759  static const int FORMAT_SIZE = 20;
7760  char format[FORMAT_SIZE] = {0};
7761  char absolute_short_name = 0;
7762 
7763  KMP_DEBUG_ASSERT(gtid >= 0);
7764  KMP_DEBUG_ASSERT(th);
7765  KMP_DEBUG_ASSERT(**ptr == '%');
7766  KMP_DEBUG_ASSERT(field_buffer);
7767 
7768  __kmp_str_buf_clear(field_buffer);
7769 
7770  // Skip the initial %
7771  (*ptr)++;
7772 
7773  // Check for %% first
7774  if (**ptr == '%') {
7775  __kmp_str_buf_cat(field_buffer, "%", 1);
7776  (*ptr)++; // skip over the second %
7777  return 1;
7778  }
7779 
7780  // Parse field modifiers if they are present
7781  pad_zeros = false;
7782  if (**ptr == '0') {
7783  pad_zeros = true;
7784  (*ptr)++; // skip over 0
7785  }
7786  right_justify = false;
7787  if (**ptr == '.') {
7788  right_justify = true;
7789  (*ptr)++; // skip over .
7790  }
7791  // Parse width of field: [width_left, width_right)
7792  width_left = width_right = NULL;
7793  if (**ptr >= '0' && **ptr <= '9') {
7794  width_left = *ptr;
7795  SKIP_DIGITS(*ptr);
7796  width_right = *ptr;
7797  }
7798 
7799  // Create the format for KMP_SNPRINTF based on flags parsed above
7800  format_index = 0;
7801  format[format_index++] = '%';
7802  if (!right_justify)
7803  format[format_index++] = '-';
7804  if (pad_zeros)
7805  format[format_index++] = '0';
7806  if (width_left && width_right) {
7807  int i = 0;
7808  // Only allow 8 digit number widths.
7809  // This also prevents overflowing the format variable
7810  while (i < 8 && width_left < width_right) {
7811  format[format_index++] = *width_left;
7812  width_left++;
7813  i++;
7814  }
7815  }
7816 
7817  // Parse a name (long or short)
7818  // Canonicalize the name into absolute_short_name
7819  found_valid_name = false;
7820  parse_long_name = (**ptr == '{');
7821  if (parse_long_name)
7822  (*ptr)++; // skip initial left brace
7823  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7824  sizeof(__kmp_affinity_format_table[0]);
7825  ++i) {
7826  char short_name = __kmp_affinity_format_table[i].short_name;
7827  const char *long_name = __kmp_affinity_format_table[i].long_name;
7828  char field_format = __kmp_affinity_format_table[i].field_format;
7829  if (parse_long_name) {
7830  int length = KMP_STRLEN(long_name);
7831  if (strncmp(*ptr, long_name, length) == 0) {
7832  found_valid_name = true;
7833  (*ptr) += length; // skip the long name
7834  }
7835  } else if (**ptr == short_name) {
7836  found_valid_name = true;
7837  (*ptr)++; // skip the short name
7838  }
7839  if (found_valid_name) {
7840  format[format_index++] = field_format;
7841  format[format_index++] = '\0';
7842  absolute_short_name = short_name;
7843  break;
7844  }
7845  }
7846  if (parse_long_name) {
7847  if (**ptr != '}') {
7848  absolute_short_name = 0;
7849  } else {
7850  (*ptr)++; // skip over the right brace
7851  }
7852  }
7853 
7854  // Attempt to fill the buffer with the requested
7855  // value using snprintf within __kmp_str_buf_print()
7856  switch (absolute_short_name) {
7857  case 't':
7858  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7859  break;
7860  case 'T':
7861  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7862  break;
7863  case 'L':
7864  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7865  break;
7866  case 'n':
7867  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7868  break;
7869  case 'H': {
7870  static const int BUFFER_SIZE = 256;
7871  char buf[BUFFER_SIZE];
7872  __kmp_expand_host_name(buf, BUFFER_SIZE);
7873  rc = __kmp_str_buf_print(field_buffer, format, buf);
7874  } break;
7875  case 'P':
7876  rc = __kmp_str_buf_print(field_buffer, format, getpid());
7877  break;
7878  case 'i':
7879  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7880  break;
7881  case 'N':
7882  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7883  break;
7884  case 'a':
7885  field_value =
7886  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7887  rc = __kmp_str_buf_print(field_buffer, format, field_value);
7888  break;
7889 #if KMP_AFFINITY_SUPPORTED
7890  case 'A': {
7891  kmp_str_buf_t buf;
7892  __kmp_str_buf_init(&buf);
7893  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7894  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7895  __kmp_str_buf_free(&buf);
7896  } break;
7897 #endif
7898  default:
7899  // According to the spec, if an implementation does not have info for a
7900  // field type, then "undefined" is printed
7901  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7902  // Skip the field
7903  if (parse_long_name) {
7904  SKIP_TOKEN(*ptr);
7905  if (**ptr == '}')
7906  (*ptr)++;
7907  } else {
7908  (*ptr)++;
7909  }
7910  }
7911 
7912  KMP_ASSERT(format_index <= FORMAT_SIZE);
7913  return rc;
7914 }
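// Illustrative aside (not part of this file): a standalone sketch of the
// snprintf formats built above. "%0.8L" parses to pad_zeros=true,
// right_justify=true, width "8", field 'L' ('d'), i.e. the format "%08d",
// while "%8L" (no '0', no '.') builds "%-8d". The program below only
// demonstrates those two resulting printf formats.
//
//   #include <stdio.h>
//
//   int main(void) {
//     char out[32];
//     snprintf(out, sizeof(out), "%08d", 2); // what "%0.8L" builds for level 2
//     printf("[%s]\n", out);                 // prints [00000002]
//     snprintf(out, sizeof(out), "%-8d", 2); // what "%8L" builds
//     printf("[%s]\n", out);                 // prints [2       ]
//     return 0;
//   }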
7915 
7916 /*
7917  * Return the number of characters needed to hold the affinity string
7918  * (not including the terminating null byte).
7919  * The resulting string is written to buffer, which the caller can then
7920  * handle afterwards.
7921 */
7922 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7923  kmp_str_buf_t *buffer) {
7924  const char *parse_ptr;
7925  size_t retval;
7926  const kmp_info_t *th;
7927  kmp_str_buf_t field;
7928 
7929  KMP_DEBUG_ASSERT(buffer);
7930  KMP_DEBUG_ASSERT(gtid >= 0);
7931 
7932  __kmp_str_buf_init(&field);
7933  __kmp_str_buf_clear(buffer);
7934 
7935  th = __kmp_threads[gtid];
7936  retval = 0;
7937 
7938  // If format is NULL or zero-length string, then we use
7939  // affinity-format-var ICV
7940  parse_ptr = format;
7941  if (parse_ptr == NULL || *parse_ptr == '\0') {
7942  parse_ptr = __kmp_affinity_format;
7943  }
7944  KMP_DEBUG_ASSERT(parse_ptr);
7945 
7946  while (*parse_ptr != '\0') {
7947  // Parse a field
7948  if (*parse_ptr == '%') {
7949  // Put field in the buffer
7950  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7951  __kmp_str_buf_catbuf(buffer, &field);
7952  retval += rc;
7953  } else {
7954  // Put literal character in buffer
7955  __kmp_str_buf_cat(buffer, parse_ptr, 1);
7956  retval++;
7957  parse_ptr++;
7958  }
7959  }
7960  __kmp_str_buf_free(&field);
7961  return retval;
7962 }
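// Illustrative aside (not part of this file): a hedged sketch assuming the
// OpenMP 5.0 routine omp_capture_affinity() is routed to
// __kmp_aux_capture_affinity(). Literal characters are copied through,
// %-fields are expanded, and the return value is the number of characters
// needed, excluding the null byte.
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//   #pragma omp parallel
//     {
//       char buf[128]; // private per thread
//       size_t n = omp_capture_affinity(buf, sizeof(buf), "level=%L thread=%n");
//       printf("needed %zu chars: %s\n", n, buf);
//     }
//     return 0;
//   }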
7963 
7964 // Displays the affinity string to stdout
7965 void __kmp_aux_display_affinity(int gtid, const char *format) {
7966  kmp_str_buf_t buf;
7967  __kmp_str_buf_init(&buf);
7968  __kmp_aux_capture_affinity(gtid, format, &buf);
7969  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7970  __kmp_str_buf_free(&buf);
7971 }
7972 #endif // OMP_50_ENABLED
7973 
7974 /* ------------------------------------------------------------------------ */
7975 
7976 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7977  int blocktime = arg; /* argument is in milliseconds */
7978 #if KMP_USE_MONITOR
7979  int bt_intervals;
7980 #endif
7981  int bt_set;
7982 
7983  __kmp_save_internal_controls(thread);
7984 
7985  /* Normalize and set blocktime for the teams */
7986  if (blocktime < KMP_MIN_BLOCKTIME)
7987  blocktime = KMP_MIN_BLOCKTIME;
7988  else if (blocktime > KMP_MAX_BLOCKTIME)
7989  blocktime = KMP_MAX_BLOCKTIME;
7990 
7991  set__blocktime_team(thread->th.th_team, tid, blocktime);
7992  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7993 
7994 #if KMP_USE_MONITOR
7995  /* Calculate and set blocktime intervals for the teams */
7996  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7997 
7998  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7999  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8000 #endif
8001 
8002  /* Record that blocktime has been explicitly set */
8003  bt_set = TRUE;
8004 
8005  set__bt_set_team(thread->th.th_team, tid, bt_set);
8006  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8007 #if KMP_USE_MONITOR
8008  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8009  "bt_intervals=%d, monitor_updates=%d\n",
8010  __kmp_gtid_from_tid(tid, thread->th.th_team),
8011  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8012  __kmp_monitor_wakeups));
8013 #else
8014  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8015  __kmp_gtid_from_tid(tid, thread->th.th_team),
8016  thread->th.th_team->t.t_id, tid, blocktime));
8017 #endif
8018 }
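// Illustrative aside (not part of this file): a hedged usage sketch assuming
// the kmp_set_blocktime()/kmp_get_blocktime() extensions exported by this
// runtime's omp.h end up in __kmp_aux_set_blocktime(), so out-of-range values
// are clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME].
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//     kmp_set_blocktime(0); // idle worker threads sleep immediately
//     printf("blocktime=%d ms\n", kmp_get_blocktime());
//   #pragma omp parallel
//     { /* workers yield as soon as this region ends */ }
//     return 0;
//   }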
8019 
8020 void __kmp_aux_set_defaults(char const *str, int len) {
8021  if (!__kmp_init_serial) {
8022  __kmp_serial_initialize();
8023  }
8024  __kmp_env_initialize(str);
8025 
8026  if (__kmp_settings
8027 #if OMP_40_ENABLED
8028  || __kmp_display_env || __kmp_display_env_verbose
8029 #endif // OMP_40_ENABLED
8030  ) {
8031  __kmp_env_print();
8032  }
8033 } // __kmp_aux_set_defaults
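// Illustrative aside (not part of this file): a hedged sketch assuming the
// kmp_set_defaults() extension declared in this runtime's omp.h is the public
// path into __kmp_aux_set_defaults(); it takes the same "NAME=value" strings
// that would otherwise come from the environment.
//
//   #include <omp.h>
//
//   int main(void) {
//     kmp_set_defaults("KMP_BLOCKTIME=0"); // parsed like the env variable
//   #pragma omp parallel
//     { }
//     return 0;
//   }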
8034 
8035 /* ------------------------------------------------------------------------ */
8036 /* internal fast reduction routines */
8037 
8038 PACKED_REDUCTION_METHOD_T
8039 __kmp_determine_reduction_method(
8040  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8041  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8042  kmp_critical_name *lck) {
8043 
8044  // Default reduction method: critical construct ( lck != NULL, like in current
8045  // PAROPT )
8046  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8047  // can be selected by RTL
8048  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8049  // can be selected by RTL
8050  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8051  // among those generated by PAROPT.
8052 
8053  PACKED_REDUCTION_METHOD_T retval;
8054 
8055  int team_size;
8056 
8057  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8058  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8059 
8060 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8061  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8062 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8063 
8064  retval = critical_reduce_block;
8065 
8066  // another choice of getting a team size (with 1 dynamic dereference) is slower
8067  team_size = __kmp_get_team_num_threads(global_tid);
8068  if (team_size == 1) {
8069 
8070  retval = empty_reduce_block;
8071 
8072  } else {
8073 
8074  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8075 
8076 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8077 
8078 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8079  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_KFREEBSD
8080 
8081  int teamsize_cutoff = 4;
8082 
8083 #if KMP_MIC_SUPPORTED
8084  if (__kmp_mic_type != non_mic) {
8085  teamsize_cutoff = 8;
8086  }
8087 #endif
8088  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8089  if (tree_available) {
8090  if (team_size <= teamsize_cutoff) {
8091  if (atomic_available) {
8092  retval = atomic_reduce_block;
8093  }
8094  } else {
8095  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8096  }
8097  } else if (atomic_available) {
8098  retval = atomic_reduce_block;
8099  }
8100 #else
8101 #error "Unknown or unsupported OS"
8102 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8103  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_KFREEBSD
8104 
8105 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8106 
8107 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_KFREEBSD
8108 
8109  // basic tuning
8110 
8111  if (atomic_available) {
8112  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8113  retval = atomic_reduce_block;
8114  }
8115  } // otherwise: use critical section
8116 
8117 #elif KMP_OS_DARWIN
8118 
8119  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8120  if (atomic_available && (num_vars <= 3)) {
8121  retval = atomic_reduce_block;
8122  } else if (tree_available) {
8123  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8124  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8125  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8126  }
8127  } // otherwise: use critical section
8128 
8129 #else
8130 #error "Unknown or unsupported OS"
8131 #endif
8132 
8133 #else
8134 #error "Unknown or unsupported architecture"
8135 #endif
8136  }
8137 
8138  // KMP_FORCE_REDUCTION
8139 
8140  // If the team is serialized (team_size == 1), ignore the forced reduction
8141  // method and stay with the unsynchronized method (empty_reduce_block)
8142  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8143  team_size != 1) {
8144 
8145  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8146 
8147  int atomic_available, tree_available;
8148 
8149  switch ((forced_retval = __kmp_force_reduction_method)) {
8150  case critical_reduce_block:
8151  KMP_ASSERT(lck); // lck should be != 0
8152  break;
8153 
8154  case atomic_reduce_block:
8155  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8156  if (!atomic_available) {
8157  KMP_WARNING(RedMethodNotSupported, "atomic");
8158  forced_retval = critical_reduce_block;
8159  }
8160  break;
8161 
8162  case tree_reduce_block:
8163  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8164  if (!tree_available) {
8165  KMP_WARNING(RedMethodNotSupported, "tree");
8166  forced_retval = critical_reduce_block;
8167  } else {
8168 #if KMP_FAST_REDUCTION_BARRIER
8169  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8170 #endif
8171  }
8172  break;
8173 
8174  default:
8175  KMP_ASSERT(0); // "unsupported method specified"
8176  }
8177 
8178  retval = forced_retval;
8179  }
8180 
8181  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8182 
8183 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8184 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8185 
8186  return (retval);
8187 }
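// Illustrative aside (not part of this file): a user-level reduction like the
// one below is what ultimately exercises __kmp_determine_reduction_method();
// depending on architecture, OS, team size, and the number of reduction
// variables, the runtime picks the atomic, tree+barrier, or critical-section
// method selected above.
//
//   #include <stdio.h>
//
//   int main(void) {
//     double sum = 0.0;
//   #pragma omp parallel for reduction(+ : sum)
//     for (int i = 1; i <= 1000000; ++i)
//       sum += 1.0 / i;
//     printf("sum=%f\n", sum);
//     return 0;
//   }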
8188 
8189 // this function is for testing set/get/determine reduce method
8190 kmp_int32 __kmp_get_reduce_method(void) {
8191  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8192 }