LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMPTARGET_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // Windows does not need these include files because it does not use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60  KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68  KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84  int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86  kmp_internal_control_t *new_icvs,
87  ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90  int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96  kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* fast (and somewhat portable) way to get unique identifier of executing
111  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
112 int __kmp_get_global_thread_id() {
113  int i;
114  kmp_info_t **other_threads;
115  size_t stack_data;
116  char *stack_addr;
117  size_t stack_size;
118  char *stack_base;
119 
120  KA_TRACE(
121  1000,
122  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
123  __kmp_nth, __kmp_all_nth));
124 
125  /* JPH - to handle the case where __kmpc_end(0) is called immediately before
126  a parallel region, this returns KMP_GTID_DNE to force the caller to perform
127  serial initialization. KMP_GTID_DNE had to be handled at all call sites, or
128  __kmp_init_gtid guaranteed, for this to work. */
129 
130  if (!TCR_4(__kmp_init_gtid))
131  return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134  if (TCR_4(__kmp_gtid_mode) >= 3) {
135  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136  return __kmp_gtid;
137  }
138 #endif
139  if (TCR_4(__kmp_gtid_mode) >= 2) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141  return __kmp_gtid_get_specific();
142  }
143  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145  stack_addr = (char *)&stack_data;
146  other_threads = __kmp_threads;
147 
148  /* ATT: The code below is a source of potential bugs due to unsynchronized
149  access to __kmp_threads array. For example:
150  1. Current thread loads other_threads[i] to thr and checks it, it is
151  non-NULL.
152  2. Current thread is suspended by OS.
153  3. Another thread unregisters and finishes (debug versions of free()
154  may fill memory with something like 0xEF).
155  4. Current thread is resumed.
156  5. Current thread reads junk from *thr.
157  TODO: Fix it. --ln */
158 
159  for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162  if (!thr)
163  continue;
164 
165  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168  /* stack grows down -- search through all of the active threads */
169 
170  if (stack_addr <= stack_base) {
171  size_t stack_diff = stack_base - stack_addr;
172 
173  if (stack_diff <= stack_size) {
174  /* The only way we can be closer than the allocated */
175  /* stack size is if we are running on this thread. */
176  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177  return i;
178  }
179  }
180  }
181 
182  /* use __kmp_gtid_get_specific() to try to determine our gtid */
183  KA_TRACE(1000,
184  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185  "thread, using TLS\n"));
186  i = __kmp_gtid_get_specific();
187 
188  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
189 
190  /* if we haven't been assigned a gtid, then return the error code */
191  if (i < 0)
192  return i;
193 
194  /* dynamically updated stack window for uber threads to avoid get_specific
195  call */
196  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197  KMP_FATAL(StackOverflow, i);
198  }
199 
200  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201  if (stack_addr > stack_base) {
202  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205  stack_base);
206  } else {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208  stack_base - stack_addr);
209  }
210 
211  /* Reprint stack bounds for ubermaster since they have been refined */
212  if (__kmp_storage_map) {
213  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216  other_threads[i]->th.th_info.ds.ds_stacksize,
217  "th_%d stack (refinement)", i);
218  }
219  return i;
220 }
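// Editor's note: an illustrative sketch (not part of the runtime) of the
// stack-window lookup used above. Because thread stacks grow downward, the
// address of a local variable lies within [stack_base - stack_size, stack_base]
// for exactly one live thread, so scanning the registered windows recovers the
// gtid without consulting TLS. The names below (stack_window_t,
// find_gtid_by_stack) are hypothetical and exist only for this example.
#if 0
#include <stddef.h>

typedef struct {
  char *stack_base;  /* highest address of the thread's stack */
  size_t stack_size; /* bytes reserved below stack_base */
} stack_window_t;

/* Return the index whose window contains 'addr', or -1 if none matches. */
static int find_gtid_by_stack(const stack_window_t *win, int count,
                              const char *addr) {
  for (int i = 0; i < count; ++i) {
    if (addr <= win[i].stack_base &&
        (size_t)(win[i].stack_base - addr) <= win[i].stack_size)
      return i;
  }
  return -1;
}

/* Usage: int gtid = find_gtid_by_stack(windows, n, (char *)&some_local); */
#endif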
221 
222 int __kmp_get_global_thread_id_reg() {
223  int gtid;
224 
225  if (!__kmp_init_serial) {
226  gtid = KMP_GTID_DNE;
227  } else
228 #ifdef KMP_TDATA_GTID
229  if (TCR_4(__kmp_gtid_mode) >= 3) {
230  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231  gtid = __kmp_gtid;
232  } else
233 #endif
234  if (TCR_4(__kmp_gtid_mode) >= 2) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236  gtid = __kmp_gtid_get_specific();
237  } else {
238  KA_TRACE(1000,
239  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240  gtid = __kmp_get_global_thread_id();
241  }
242 
243  /* we must be a new uber master sibling thread */
244  if (gtid == KMP_GTID_DNE) {
245  KA_TRACE(10,
246  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247  "Registering a new gtid.\n"));
248  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249  if (!__kmp_init_serial) {
250  __kmp_do_serial_initialize();
251  gtid = __kmp_gtid_get_specific();
252  } else {
253  gtid = __kmp_register_root(FALSE);
254  }
255  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257  }
258 
259  KMP_DEBUG_ASSERT(gtid >= 0);
260 
261  return gtid;
262 }
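// Editor's note: an illustrative sketch of the lazy-registration pattern used
// above: a thread with no gtid takes the initialization lock, re-checks whether
// the library was initialized while it waited, and then either finishes serial
// initialization or registers itself as a new root. All names below are
// hypothetical stand-ins for the runtime's bootstrap lock and init routines.
#if 0
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
static bool serial_initialized = false;

static int lazy_register(int gtid, void (*do_serial_init)(void),
                         int (*register_root)(void), int (*query_gtid)(void)) {
  if (gtid >= 0)
    return gtid;                  /* already registered: fast path */
  pthread_mutex_lock(&init_lock);
  if (!serial_initialized) {      /* re-check under the lock */
    do_serial_init();             /* also registers the calling thread */
    serial_initialized = true;
    gtid = query_gtid();
  } else {
    gtid = register_root();       /* library is up; just add a new root */
  }
  pthread_mutex_unlock(&init_lock);
  return gtid;
}
#endif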
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266  int f;
267  char *stack_beg = NULL;
268  char *stack_end = NULL;
269  int gtid;
270 
271  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272  if (__kmp_storage_map) {
273  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276  gtid = __kmp_gtid_from_thread(th);
277 
278  if (gtid == KMP_GTID_MONITOR) {
279  __kmp_print_storage_map_gtid(
280  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281  "th_%s stack (%s)", "mon",
282  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283  } else {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%d stack (%s)", gtid,
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  }
289  }
290 
291  /* No point in checking ubermaster threads since they use refinement and
292  * cannot overlap */
293  gtid = __kmp_gtid_from_thread(th);
294  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295  KA_TRACE(10,
296  ("__kmp_check_stack_overlap: performing extensive checking\n"));
297  if (stack_beg == NULL) {
298  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300  }
301 
302  for (f = 0; f < __kmp_threads_capacity; f++) {
303  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305  if (f_th && f_th != th) {
306  char *other_stack_end =
307  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308  char *other_stack_beg =
309  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313  /* Print the other stack values before the abort */
314  if (__kmp_storage_map)
315  __kmp_print_storage_map_gtid(
316  -1, other_stack_beg, other_stack_end,
317  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321  __kmp_msg_null);
322  }
323  }
324  }
325  }
326  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
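// Editor's note: an illustrative sketch of the range test performed above. The
// runtime checks whether either bound of this thread's stack falls strictly
// inside another thread's [beg, end) range; a common compact predicate for two
// half-open ranges is shown below (unlike the check above, it also reports the
// case where one range fully contains the other). The helper is hypothetical.
#if 0
#include <stdbool.h>

/* True if [beg1, end1) and [beg2, end2) share at least one byte. */
static bool ranges_overlap(const char *beg1, const char *end1,
                           const char *beg2, const char *end2) {
  return beg1 < end2 && beg2 < end1;
}
#endif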
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332  static int done = FALSE;
333 
334  while (!done) {
335  KMP_YIELD(TRUE);
336  }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342  char const *format, ...) {
343  char buffer[MAX_MESSAGE];
344  va_list ap;
345 
346  va_start(ap, format);
347  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348  p2, (unsigned long)size, format);
349  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350  __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352  int node;
353  if (gtid >= 0) {
354  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355  if (__kmp_storage_map_verbose) {
356  node = __kmp_get_host_node(p1);
357  if (node < 0) /* doesn't work, so don't try this next time */
358  __kmp_storage_map_verbose = FALSE;
359  else {
360  char *last;
361  int lastNode;
362  int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364  const int page_size = KMP_GET_PAGE_SIZE();
365 
366  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368  if (localProc >= 0)
369  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
370  localProc >> 1);
371  else
372  __kmp_printf_no_lock(" GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374  /* The more elaborate format is disabled for now because of the prctl
375  * hanging bug. */
376  do {
377  last = p1;
378  lastNode = node;
379  /* This loop collates adjacent pages with the same host node. */
380  do {
381  (char *)p1 += page_size;
382  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
384  lastNode);
385  } while (p1 <= p2);
386 #else
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
388  (char *)p1 + (page_size - 1),
389  __kmp_get_host_node(p1));
390  if (p1 < p2) {
391  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
392  (char *)p2 + (page_size - 1),
393  __kmp_get_host_node(p2));
394  }
395 #endif
396  }
397  }
398  } else
399  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
400  }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
403 }
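// Editor's note: an illustrative sketch of the page-boundary rounding used in
// the data-placement branch above (p & ~(page_size - 1)), which relies on the
// page size being a power of two. Helper names are hypothetical.
#if 0
#include <stdint.h>

static inline uintptr_t page_round_down(uintptr_t addr, uintptr_t page_size) {
  return addr & ~(page_size - 1); /* first byte of the page containing addr */
}

static inline uintptr_t page_round_up(uintptr_t addr, uintptr_t page_size) {
  return (addr + page_size - 1) & ~(page_size - 1); /* next page boundary */
}
#endif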
404 
405 void __kmp_warn(char const *format, ...) {
406  char buffer[MAX_MESSAGE];
407  va_list ap;
408 
409  if (__kmp_generate_warnings == kmp_warnings_off) {
410  return;
411  }
412 
413  va_start(ap, format);
414 
415  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417  __kmp_vprintf(kmp_err, buffer, ap);
418  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420  va_end(ap);
421 }
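// Editor's note: an illustrative sketch of the two-stage formatting used by
// __kmp_warn and __kmp_print_storage_map_gtid above: the caller's format string
// is first embedded in a prefixed template, and the result is then used as the
// format for the caller's variadic arguments. The function name is hypothetical
// and stderr stands in for kmp_err.
#if 0
#include <stdarg.h>
#include <stdio.h>

static void warn_sketch(const char *format, ...) {
  char wrapped[512];
  va_list ap;
  /* Stage 1: embed the user's format into the prefixed template. */
  snprintf(wrapped, sizeof(wrapped), "OMP warning: %s\n", format);
  /* Stage 2: expand the user's arguments against the wrapped format. */
  va_start(ap, format);
  vfprintf(stderr, wrapped, ap);
  va_end(ap);
}
#endif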
422 
423 void __kmp_abort_process() {
424  // Later threads may stall here, but that's ok because abort() will kill them.
425  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427  if (__kmp_debug_buf) {
428  __kmp_dump_debug_buffer();
429  }
430 
431  if (KMP_OS_WINDOWS) {
432  // Let other threads know of abnormal termination and prevent deadlock
433  // if abort happened during library initialization or shutdown
434  __kmp_global.g.g_abort = SIGABRT;
435 
436  /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
437  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
438  boxes. _set_abort_behavior() works well, but this function is not
439  available in VS7 (this is not a problem for the DLL, but it is a problem
440  for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
441  does not help, at least in some versions of the MS C RTL.
442 
443  It seems the following sequence is the only way to simulate abort() and
444  avoid the pop-up error box. */
445  raise(SIGABRT);
446  _exit(3); // Just in case the signal is ignored, exit anyway.
447  } else {
448  __kmp_unregister_library();
449  abort();
450  }
451 
452  __kmp_infinite_loop();
453  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458  // TODO: Eliminate g_abort global variable and this function.
459  // In case of abort, just call abort(); it will kill all the threads.
460  __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464  that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468  gtid);
469 
470  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474  sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476  __kmp_print_storage_map_gtid(
477  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481  &thr->th.th_bar[bs_plain_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483  gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486  &thr->th.th_bar[bs_forkjoin_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488  gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492  &thr->th.th_bar[bs_reduction_barrier + 1],
493  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494  gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499  that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502  int team_id, int num_thr) {
503  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505  header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508  &team->t.t_bar[bs_last_barrier],
509  sizeof(kmp_balign_team_t) * bs_last_barrier,
510  "%s_%d.t_bar", header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513  &team->t.t_bar[bs_plain_barrier + 1],
514  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515  header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518  &team->t.t_bar[bs_forkjoin_barrier + 1],
519  sizeof(kmp_balign_team_t),
520  "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524  &team->t.t_bar[bs_reduction_barrier + 1],
525  sizeof(kmp_balign_team_t),
526  "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529  __kmp_print_storage_map_gtid(
530  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533  __kmp_print_storage_map_gtid(
534  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538  &team->t.t_disp_buffer[num_disp_buff],
539  sizeof(dispatch_shared_info_t) * num_disp_buff,
540  "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
552  // TODO: Change to __kmp_break_bootstrap_lock().
553  __kmp_init_bootstrap_lock(lck); // make the lock released
554 }
555 
556 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
557  int i;
558  int thread_count;
559 
560  // PROCESS_DETACH is expected to be called by a thread that executes
561  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
562  // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
563  // __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
564  // threads may still be alive here, although they are about to be terminated.
565  // The entries in the array with ds_thread==0 are the most suspicious, so
566  // accessing __kmp_threads[] may not actually be safe.
567 
568  // TODO: does it make sense to check __kmp_roots[] ?
569 
570  // Check that no other live threads are registered with the OpenMP
571  // library.
572  while (1) {
573  thread_count = 0;
574  for (i = 0; i < __kmp_threads_capacity; ++i) {
575  if (!__kmp_threads)
576  continue;
577  kmp_info_t *th = __kmp_threads[i];
578  if (th == NULL)
579  continue;
580  int gtid = th->th.th_info.ds.ds_gtid;
581  if (gtid == gtid_req)
582  continue;
583  if (gtid < 0)
584  continue;
585  DWORD exit_val;
586  int alive = __kmp_is_thread_alive(th, &exit_val);
587  if (alive) {
588  ++thread_count;
589  }
590  }
591  if (thread_count == 0)
592  break; // success
593  }
594 
595  // Assume that I'm alone. Now it might be safe to check and reset locks.
596  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
597  __kmp_reset_lock(&__kmp_forkjoin_lock);
598 #ifdef KMP_DEBUG
599  __kmp_reset_lock(&__kmp_stdio_lock);
600 #endif // KMP_DEBUG
601 }
602 
603 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
604  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
605 
606  switch (fdwReason) {
607 
608  case DLL_PROCESS_ATTACH:
609  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
610 
611  return TRUE;
612 
613  case DLL_PROCESS_DETACH:
614  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
615 
616  if (lpReserved != NULL) {
617  // lpReserved is used for telling the difference:
618  // lpReserved == NULL when FreeLibrary() was called,
619  // lpReserved != NULL when the process terminates.
620  // When FreeLibrary() is called, worker threads remain alive, so they will
621  // release the forkjoin lock by themselves. When the process terminates,
622  // worker threads disappear, triggering the problem of an unreleased forkjoin
623  // lock as described below.
624 
625  // A worker thread can take the forkjoin lock. The problem comes up if
626  // that worker thread becomes dead before it releases the forkjoin lock.
627  // The forkjoin lock remains taken, while the thread executing
628  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
629  // to take the forkjoin lock and will always fail, so that the application
630  // will never finish [normally]. This scenario is possible if
631  // __kmpc_end() has not been executed. This is not just a corner
632  // case; common scenarios include:
633  // - the main function was compiled by an alternative compiler;
634  // - the main function was compiled by icl but without /Qopenmp
635  // (application with plugins);
636  // - application terminates by calling C exit(), Fortran CALL EXIT() or
637  // Fortran STOP.
638  // - a live foreign thread prevented __kmpc_end() from doing cleanup.
639  //
640  // This is a hack to work around the problem.
641  // TODO: !!! figure out something better.
642  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
643  }
644 
645  __kmp_internal_end_library(__kmp_gtid_get_specific());
646 
647  return TRUE;
648 
649  case DLL_THREAD_ATTACH:
650  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
651 
652  /* to register new sibling threads on every thread attach, call
653  * __kmp_get_gtid() here */
654  return TRUE;
655 
656  case DLL_THREAD_DETACH:
657  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
658 
659  __kmp_internal_end_thread(__kmp_gtid_get_specific());
660  return TRUE;
661  }
662 
663  return TRUE;
664 }
665 
666 #endif /* KMP_OS_WINDOWS */
667 #endif /* KMP_DYNAMIC_LIB */
668 
669 /* __kmp_parallel_deo -- Wait until it's our turn. */
670 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
671  int gtid = *gtid_ref;
672 #ifdef BUILD_PARALLEL_ORDERED
673  kmp_team_t *team = __kmp_team_from_gtid(gtid);
674 #endif /* BUILD_PARALLEL_ORDERED */
675 
676  if (__kmp_env_consistency_check) {
677  if (__kmp_threads[gtid]->th.th_root->r.r_active)
678 #if KMP_USE_DYNAMIC_LOCK
679  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
680 #else
681  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
682 #endif
683  }
684 #ifdef BUILD_PARALLEL_ORDERED
685  if (!team->t.t_serialized) {
686  KMP_MB();
687  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
688  NULL);
689  KMP_MB();
690  }
691 #endif /* BUILD_PARALLEL_ORDERED */
692 }
693 
694 /* __kmp_parallel_dxo -- Signal the next task. */
695 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696  int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698  int tid = __kmp_tid_from_gtid(gtid);
699  kmp_team_t *team = __kmp_team_from_gtid(gtid);
700 #endif /* BUILD_PARALLEL_ORDERED */
701 
702  if (__kmp_env_consistency_check) {
703  if (__kmp_threads[gtid]->th.th_root->r.r_active)
704  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
705  }
706 #ifdef BUILD_PARALLEL_ORDERED
707  if (!team->t.t_serialized) {
708  KMP_MB(); /* Flush all pending memory write invalidates. */
709 
710  /* use the tid of the next thread in this team */
711  /* TODO replace with general release procedure */
712  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
713 
714  KMP_MB(); /* Flush all pending memory write invalidates. */
715  }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
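// Editor's note: an illustrative sketch (not runtime code) of the turn-taking
// protocol implemented by __kmp_parallel_deo/__kmp_parallel_dxo above: each
// thread spins until the shared counter equals its tid, executes its ordered
// chunk, then hands the turn to (tid + 1) % nproc. C11 atomics stand in for
// the runtime's KMP_WAIT/KMP_MB machinery.
#if 0
#include <stdatomic.h>

static _Atomic int ordered_turn; /* analogous to team->t.t_ordered.dt.t_value */

static void ordered_enter(int tid) {
  while (atomic_load_explicit(&ordered_turn, memory_order_acquire) != tid)
    ; /* spin until it is our turn */
}

static void ordered_exit(int tid, int nproc) {
  atomic_store_explicit(&ordered_turn, (tid + 1) % nproc,
                        memory_order_release); /* pass the turn on */
}
#endif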
718 
719 /* ------------------------------------------------------------------------ */
720 /* The BARRIER for a SINGLE process section is always explicit */
721 
722 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
723  int status;
724  kmp_info_t *th;
725  kmp_team_t *team;
726 
727  if (!TCR_4(__kmp_init_parallel))
728  __kmp_parallel_initialize();
729  __kmp_resume_if_soft_paused();
730 
731  th = __kmp_threads[gtid];
732  team = th->th.th_team;
733  status = 0;
734 
735  th->th.th_ident = id_ref;
736 
737  if (team->t.t_serialized) {
738  status = 1;
739  } else {
740  kmp_int32 old_this = th->th.th_local.this_construct;
741 
742  ++th->th.th_local.this_construct;
743  /* try to set team count to thread count--success means thread got the
744  single block */
745  /* TODO: Should this be acquire or release? */
746  if (team->t.t_construct == old_this) {
747  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
748  th->th.th_local.this_construct);
749  }
750 #if USE_ITT_BUILD
751  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
752  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
753  team->t.t_active_level ==
754  1) { // Only report metadata by master of active team at level 1
755  __kmp_itt_metadata_single(id_ref);
756  }
757 #endif /* USE_ITT_BUILD */
758  }
759 
760  if (__kmp_env_consistency_check) {
761  if (status && push_ws) {
762  __kmp_push_workshare(gtid, ct_psingle, id_ref);
763  } else {
764  __kmp_check_workshare(gtid, ct_psingle, id_ref);
765  }
766  }
767 #if USE_ITT_BUILD
768  if (status) {
769  __kmp_itt_single_start(gtid);
770  }
771 #endif /* USE_ITT_BUILD */
772  return status;
773 }
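// Editor's note: an illustrative sketch of the election performed in
// __kmp_enter_single above: every thread bumps a private construct counter, and
// the one that successfully CASes the shared team counter from the old value to
// the new one wins the single region. C11 atomics stand in for
// __kmp_atomic_compare_store_acq; names are hypothetical.
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int team_construct; /* analogous to team->t.t_construct */

/* Returns true for exactly one of the threads passing the same old value. */
static bool try_win_single(int *my_construct) {
  int old_val = *my_construct;
  ++(*my_construct);
  return atomic_compare_exchange_strong_explicit(
      &team_construct, &old_val, *my_construct, memory_order_acquire,
      memory_order_relaxed);
}
#endif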
774 
775 void __kmp_exit_single(int gtid) {
776 #if USE_ITT_BUILD
777  __kmp_itt_single_end(gtid);
778 #endif /* USE_ITT_BUILD */
779  if (__kmp_env_consistency_check)
780  __kmp_pop_workshare(gtid, ct_psingle, NULL);
781 }
782 
783 /* Determine whether we can go parallel or must use a serialized parallel
784  * region, and how many threads we can use.
785  * set_nthreads is the number of threads requested for the team.
786  * Returns 1 if we should serialize or only use one thread,
787  * otherwise the number of threads to use.
788  * The forkjoin lock is held by the caller. */
789 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
790  int master_tid, int set_nthreads,
791  int enter_teams) {
792  int capacity;
793  int new_nthreads;
794  KMP_DEBUG_ASSERT(__kmp_init_serial);
795  KMP_DEBUG_ASSERT(root && parent_team);
796  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
797 
798  // If dyn-var is set, dynamically adjust the number of desired threads,
799  // according to the method specified by dynamic_mode.
800  new_nthreads = set_nthreads;
801  if (!get__dynamic_2(parent_team, master_tid)) {
802  ;
803  }
804 #ifdef USE_LOAD_BALANCE
805  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
806  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
807  if (new_nthreads == 1) {
808  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
809  "reservation to 1 thread\n",
810  master_tid));
811  return 1;
812  }
813  if (new_nthreads < set_nthreads) {
814  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
815  "reservation to %d threads\n",
816  master_tid, new_nthreads));
817  }
818  }
819 #endif /* USE_LOAD_BALANCE */
820  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
821  new_nthreads = __kmp_avail_proc - __kmp_nth +
822  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
823  if (new_nthreads <= 1) {
824  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
825  "reservation to 1 thread\n",
826  master_tid));
827  return 1;
828  }
829  if (new_nthreads < set_nthreads) {
830  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
831  "reservation to %d threads\n",
832  master_tid, new_nthreads));
833  } else {
834  new_nthreads = set_nthreads;
835  }
836  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
837  if (set_nthreads > 2) {
838  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
839  new_nthreads = (new_nthreads % set_nthreads) + 1;
840  if (new_nthreads == 1) {
841  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
842  "reservation to 1 thread\n",
843  master_tid));
844  return 1;
845  }
846  if (new_nthreads < set_nthreads) {
847  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
848  "reservation to %d threads\n",
849  master_tid, new_nthreads));
850  }
851  }
852  } else {
853  KMP_ASSERT(0);
854  }
855 
856  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
857  if (__kmp_nth + new_nthreads -
858  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
859  __kmp_max_nth) {
860  int tl_nthreads = __kmp_max_nth - __kmp_nth +
861  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
862  if (tl_nthreads <= 0) {
863  tl_nthreads = 1;
864  }
865 
866  // If dyn-var is false, emit a 1-time warning.
867  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
868  __kmp_reserve_warn = 1;
869  __kmp_msg(kmp_ms_warning,
870  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
871  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
872  }
873  if (tl_nthreads == 1) {
874  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
875  "reduced reservation to 1 thread\n",
876  master_tid));
877  return 1;
878  }
879  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
880  "reservation to %d threads\n",
881  master_tid, tl_nthreads));
882  new_nthreads = tl_nthreads;
883  }
884 
885  // Respect OMP_THREAD_LIMIT
886  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
887  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
888  if (cg_nthreads + new_nthreads -
889  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
890  max_cg_threads) {
891  int tl_nthreads = max_cg_threads - cg_nthreads +
892  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
893  if (tl_nthreads <= 0) {
894  tl_nthreads = 1;
895  }
896 
897  // If dyn-var is false, emit a 1-time warning.
898  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
899  __kmp_reserve_warn = 1;
900  __kmp_msg(kmp_ms_warning,
901  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
902  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
903  }
904  if (tl_nthreads == 1) {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
906  "reduced reservation to 1 thread\n",
907  master_tid));
908  return 1;
909  }
910  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
911  "reservation to %d threads\n",
912  master_tid, tl_nthreads));
913  new_nthreads = tl_nthreads;
914  }
915 
916  // Check if the threads array is large enough, or needs expanding.
917  // See comment in __kmp_register_root() about the adjustment if
918  // __kmp_threads[0] == NULL.
919  capacity = __kmp_threads_capacity;
920  if (TCR_PTR(__kmp_threads[0]) == NULL) {
921  --capacity;
922  }
923  if (__kmp_nth + new_nthreads -
924  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
925  capacity) {
926  // Expand the threads array.
927  int slotsRequired = __kmp_nth + new_nthreads -
928  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
929  capacity;
930  int slotsAdded = __kmp_expand_threads(slotsRequired);
931  if (slotsAdded < slotsRequired) {
932  // The threads array was not expanded enough.
933  new_nthreads -= (slotsRequired - slotsAdded);
934  KMP_ASSERT(new_nthreads >= 1);
935 
936  // If dyn-var is false, emit a 1-time warning.
937  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
938  __kmp_reserve_warn = 1;
939  if (__kmp_tp_cached) {
940  __kmp_msg(kmp_ms_warning,
941  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
943  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
944  } else {
945  __kmp_msg(kmp_ms_warning,
946  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
947  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
948  }
949  }
950  }
951  }
952 
953 #ifdef KMP_DEBUG
954  if (new_nthreads == 1) {
955  KC_TRACE(10,
956  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
957  "dead roots and rechecking; requested %d threads\n",
958  __kmp_get_gtid(), set_nthreads));
959  } else {
960  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
961  " %d threads\n",
962  __kmp_get_gtid(), new_nthreads, set_nthreads));
963  }
964 #endif // KMP_DEBUG
965  return new_nthreads;
966 }
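// Editor's note: an illustrative sketch of the clamping arithmetic applied
// above when honoring KMP_DEVICE_THREAD_LIMIT / OMP_THREAD_LIMIT: the new team
// may use the limit minus the threads already in use, plus the threads the
// forking (hot) team contributes anyway, and never fewer than one (serialize).
// The helper name and parameters are hypothetical.
#if 0
/* Mirrors: tl_nthreads = __kmp_max_nth - __kmp_nth +
 *          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);           */
static int clamp_new_team_size(int requested, int limit, int already_in_use,
                               int reusable_from_parent) {
  int allowed = limit - already_in_use + reusable_from_parent;
  if (allowed < 1)
    allowed = 1; /* there is always room for a serialized (1-thread) team */
  return requested < allowed ? requested : allowed;
}
#endif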
967 
968 /* Allocate threads from the thread pool and assign them to the new team. We
969  are assured that there are enough threads available, because we checked this
970  earlier while holding the forkjoin lock. */
971 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
972  kmp_info_t *master_th, int master_gtid) {
973  int i;
974  int use_hot_team;
975 
976  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
977  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
978  KMP_MB();
979 
980  /* first, let's setup the master thread */
981  master_th->th.th_info.ds.ds_tid = 0;
982  master_th->th.th_team = team;
983  master_th->th.th_team_nproc = team->t.t_nproc;
984  master_th->th.th_team_master = master_th;
985  master_th->th.th_team_serialized = FALSE;
986  master_th->th.th_dispatch = &team->t.t_dispatch[0];
987 
988 /* make sure we are not the optimized hot team */
989 #if KMP_NESTED_HOT_TEAMS
990  use_hot_team = 0;
991  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
992  if (hot_teams) { // hot teams array is not allocated if
993  // KMP_HOT_TEAMS_MAX_LEVEL=0
994  int level = team->t.t_active_level - 1; // index in array of hot teams
995  if (master_th->th.th_teams_microtask) { // are we inside the teams?
996  if (master_th->th.th_teams_size.nteams > 1) {
997  ++level; // level was not increased in teams construct for
998  // team_of_masters
999  }
1000  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1001  master_th->th.th_teams_level == team->t.t_level) {
1002  ++level; // level was not increased in teams construct for
1003  // team_of_workers before the parallel
1004  } // team->t.t_level will be increased inside parallel
1005  }
1006  if (level < __kmp_hot_teams_max_level) {
1007  if (hot_teams[level].hot_team) {
1008  // hot team has already been allocated for given level
1009  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1010  use_hot_team = 1; // the team is ready to use
1011  } else {
1012  use_hot_team = 0; // AC: threads are not allocated yet
1013  hot_teams[level].hot_team = team; // remember new hot team
1014  hot_teams[level].hot_team_nth = team->t.t_nproc;
1015  }
1016  } else {
1017  use_hot_team = 0;
1018  }
1019  }
1020 #else
1021  use_hot_team = team == root->r.r_hot_team;
1022 #endif
1023  if (!use_hot_team) {
1024 
1025  /* install the master thread */
1026  team->t.t_threads[0] = master_th;
1027  __kmp_initialize_info(master_th, team, 0, master_gtid);
1028 
1029  /* now, install the worker threads */
1030  for (i = 1; i < team->t.t_nproc; i++) {
1031 
1032  /* fork or reallocate a new thread and install it in team */
1033  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1034  team->t.t_threads[i] = thr;
1035  KMP_DEBUG_ASSERT(thr);
1036  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1037  /* align team and thread arrived states */
1038  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1039  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1040  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1041  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1042  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1043  team->t.t_bar[bs_plain_barrier].b_arrived));
1044  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1045  thr->th.th_teams_level = master_th->th.th_teams_level;
1046  thr->th.th_teams_size = master_th->th.th_teams_size;
1047  { // Initialize threads' barrier data.
1048  int b;
1049  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1050  for (b = 0; b < bs_last_barrier; ++b) {
1051  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1052  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1053 #if USE_DEBUGGER
1054  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1055 #endif
1056  }
1057  }
1058  }
1059 
1060 #if KMP_AFFINITY_SUPPORTED
1061  __kmp_partition_places(team);
1062 #endif
1063  }
1064 
1065  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1066  for (i = 0; i < team->t.t_nproc; i++) {
1067  kmp_info_t *thr = team->t.t_threads[i];
1068  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1069  thr->th.th_prev_level != team->t.t_level) {
1070  team->t.t_display_affinity = 1;
1071  break;
1072  }
1073  }
1074  }
1075 
1076  KMP_MB();
1077 }
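// Editor's note: an illustrative sketch of why the workers installed above copy
// the team's per-barrier arrived counters: a thread taken from the pool may
// still carry counters from a previous team, and the barrier algorithms compare
// these values, so each worker is re-based to the team's current state before
// it participates. Types and names below are hypothetical.
#if 0
enum { SKETCH_NUM_BARRIERS = 3 }; /* plain, forkjoin, reduction */

typedef struct { unsigned long long arrived; } sketch_bar_t;

static void align_worker_barriers(sketch_bar_t *worker_bar,
                                  const sketch_bar_t *team_bar) {
  for (int b = 0; b < SKETCH_NUM_BARRIERS; ++b)
    worker_bar[b].arrived = team_bar[b].arrived; /* re-base to team state */
}
#endif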
1078 
1079 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1080 // Propagate any changes to the floating point control registers out to the
1081 // team. We try to avoid unnecessary writes to the relevant cache line in the
1082 // team structure, so we don't make changes unless they are needed.
1083 inline static void propagateFPControl(kmp_team_t *team) {
1084  if (__kmp_inherit_fp_control) {
1085  kmp_int16 x87_fpu_control_word;
1086  kmp_uint32 mxcsr;
1087 
1088  // Get master values of FPU control flags (both X87 and vector)
1089  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1090  __kmp_store_mxcsr(&mxcsr);
1091  mxcsr &= KMP_X86_MXCSR_MASK;
1092 
1093  // There is no point looking at t_fp_control_saved here.
1094  // If it is TRUE, we still have to update the values if they are different
1095  // from those we now have. If it is FALSE we didn't save anything yet, but
1096  // our objective is the same. We have to ensure that the values in the team
1097  // are the same as those we have.
1098  // So, this code achieves what we need whether or not t_fp_control_saved is
1099  // true. By checking whether the value needs updating we avoid unnecessary
1100  // writes that would put the cache-line into a written state, causing all
1101  // threads in the team to have to read it again.
1102  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1103  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1104  // Although we don't use this value, other code in the runtime wants to know
1105  // whether it should restore them. So we must ensure it is correct.
1106  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1107  } else {
1108  // Similarly here. Don't write to this cache-line in the team structure
1109  // unless we have to.
1110  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1111  }
1112 }
1113 
1114 // Do the opposite, setting the hardware registers to the updated values from
1115 // the team.
1116 inline static void updateHWFPControl(kmp_team_t *team) {
1117  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1118  // Only reset the fp control regs if they have been changed in the team by
1119  // the parallel region that we are exiting.
1120  kmp_int16 x87_fpu_control_word;
1121  kmp_uint32 mxcsr;
1122  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1123  __kmp_store_mxcsr(&mxcsr);
1124  mxcsr &= KMP_X86_MXCSR_MASK;
1125 
1126  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1127  __kmp_clear_x87_fpu_status_word();
1128  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1129  }
1130 
1131  if (team->t.t_mxcsr != mxcsr) {
1132  __kmp_load_mxcsr(&team->t.t_mxcsr);
1133  }
1134  }
1135 }
1136 #else
1137 #define propagateFPControl(x) ((void)0)
1138 #define updateHWFPControl(x) ((void)0)
1139 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
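// Editor's note: an illustrative sketch of the check-before-write idiom that
// KMP_CHECK_UPDATE applies in propagateFPControl above: compare first and store
// only on change, so an unchanged value never puts the shared cache line into a
// modified state and forces the rest of the team to re-read it. The macro below
// is a hypothetical stand-in, not the runtime's definition.
#if 0
#define CHECK_UPDATE_SKETCH(dst, val)                                          \
  do {                                                                         \
    if ((dst) != (val))                                                        \
      (dst) = (val); /* write only when the value actually changed */          \
  } while (0)
#endif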
1140 
1141 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1142  int realloc); // forward declaration
1143 
1144 /* Run a parallel region that has been serialized, so it runs only in a team
1145  of the single master thread. */
1146 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1147  kmp_info_t *this_thr;
1148  kmp_team_t *serial_team;
1149 
1150  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1151 
1152  /* Skip all this code for autopar serialized loops since it results in
1153  unacceptable overhead */
1154  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1155  return;
1156 
1157  if (!TCR_4(__kmp_init_parallel))
1158  __kmp_parallel_initialize();
1159  __kmp_resume_if_soft_paused();
1160 
1161  this_thr = __kmp_threads[global_tid];
1162  serial_team = this_thr->th.th_serial_team;
1163 
1164  /* utilize the serialized team held by this thread */
1165  KMP_DEBUG_ASSERT(serial_team);
1166  KMP_MB();
1167 
1168  if (__kmp_tasking_mode != tskm_immediate_exec) {
1169  KMP_DEBUG_ASSERT(
1170  this_thr->th.th_task_team ==
1171  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1172  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1173  NULL);
1174  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1175  "team %p, new task_team = NULL\n",
1176  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1177  this_thr->th.th_task_team = NULL;
1178  }
1179 
1180  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1181  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1182  proc_bind = proc_bind_false;
1183  } else if (proc_bind == proc_bind_default) {
1184  // No proc_bind clause was specified, so use the current value
1185  // of proc-bind-var for this parallel region.
1186  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1187  }
1188  // Reset for next parallel region
1189  this_thr->th.th_set_proc_bind = proc_bind_default;
1190 
1191 #if OMPT_SUPPORT
1192  ompt_data_t ompt_parallel_data = ompt_data_none;
1193  ompt_data_t *implicit_task_data;
1194  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1195  if (ompt_enabled.enabled &&
1196  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1197 
1198  ompt_task_info_t *parent_task_info;
1199  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1200 
1201  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1202  if (ompt_enabled.ompt_callback_parallel_begin) {
1203  int team_size = 1;
1204 
1205  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1206  &(parent_task_info->task_data), &(parent_task_info->frame),
1207  &ompt_parallel_data, team_size,
1208  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1209  }
1210  }
1211 #endif // OMPT_SUPPORT
1212 
1213  if (this_thr->th.th_team != serial_team) {
1214  // Nested level will be an index in the nested nthreads array
1215  int level = this_thr->th.th_team->t.t_level;
1216 
1217  if (serial_team->t.t_serialized) {
1218  /* this serial team was already used
1219  TODO: increase performance by making these locks more specific */
1220  kmp_team_t *new_team;
1221 
1222  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1223 
1224  new_team =
1225  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1226 #if OMPT_SUPPORT
1227  ompt_parallel_data,
1228 #endif
1229  proc_bind, &this_thr->th.th_current_task->td_icvs,
1230  0 USE_NESTED_HOT_ARG(NULL));
1231  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1232  KMP_ASSERT(new_team);
1233 
1234  /* setup new serialized team and install it */
1235  new_team->t.t_threads[0] = this_thr;
1236  new_team->t.t_parent = this_thr->th.th_team;
1237  serial_team = new_team;
1238  this_thr->th.th_serial_team = serial_team;
1239 
1240  KF_TRACE(
1241  10,
1242  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1243  global_tid, serial_team));
1244 
1245  /* TODO the above breaks the requirement that if we run out of resources,
1246  then we can still guarantee that serialized teams are ok, since we may
1247  need to allocate a new one */
1248  } else {
1249  KF_TRACE(
1250  10,
1251  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1252  global_tid, serial_team));
1253  }
1254 
1255  /* we have to initialize this serial team */
1256  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1257  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1258  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1259  serial_team->t.t_ident = loc;
1260  serial_team->t.t_serialized = 1;
1261  serial_team->t.t_nproc = 1;
1262  serial_team->t.t_parent = this_thr->th.th_team;
1263  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1264  this_thr->th.th_team = serial_team;
1265  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1266 
1267  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1268  this_thr->th.th_current_task));
1269  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1270  this_thr->th.th_current_task->td_flags.executing = 0;
1271 
1272  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1273 
1274  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1275  implicit task for each serialized task represented by
1276  team->t.t_serialized? */
1277  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1278  &this_thr->th.th_current_task->td_parent->td_icvs);
1279 
1280  // Thread value exists in the nested nthreads array for the next nested
1281  // level
1282  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1283  this_thr->th.th_current_task->td_icvs.nproc =
1284  __kmp_nested_nth.nth[level + 1];
1285  }
1286 
1287  if (__kmp_nested_proc_bind.used &&
1288  (level + 1 < __kmp_nested_proc_bind.used)) {
1289  this_thr->th.th_current_task->td_icvs.proc_bind =
1290  __kmp_nested_proc_bind.bind_types[level + 1];
1291  }
1292 
1293 #if USE_DEBUGGER
1294  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1295 #endif
1296  this_thr->th.th_info.ds.ds_tid = 0;
1297 
1298  /* set thread cache values */
1299  this_thr->th.th_team_nproc = 1;
1300  this_thr->th.th_team_master = this_thr;
1301  this_thr->th.th_team_serialized = 1;
1302 
1303  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1304  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1305  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1306 
1307  propagateFPControl(serial_team);
1308 
1309  /* check if we need to allocate dispatch buffers stack */
1310  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1311  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1312  serial_team->t.t_dispatch->th_disp_buffer =
1313  (dispatch_private_info_t *)__kmp_allocate(
1314  sizeof(dispatch_private_info_t));
1315  }
1316  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1317 
1318  KMP_MB();
1319 
1320  } else {
1321  /* this serialized team is already being used,
1322  * that's fine, just add another nested level */
1323  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1324  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1325  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1326  ++serial_team->t.t_serialized;
1327  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1328 
1329  // Nested level will be an index in the nested nthreads array
1330  int level = this_thr->th.th_team->t.t_level;
1331  // Thread value exists in the nested nthreads array for the next nested
1332  // level
1333  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1334  this_thr->th.th_current_task->td_icvs.nproc =
1335  __kmp_nested_nth.nth[level + 1];
1336  }
1337  serial_team->t.t_level++;
1338  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1339  "of serial team %p to %d\n",
1340  global_tid, serial_team, serial_team->t.t_level));
1341 
1342  /* allocate/push dispatch buffers stack */
1343  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1344  {
1345  dispatch_private_info_t *disp_buffer =
1346  (dispatch_private_info_t *)__kmp_allocate(
1347  sizeof(dispatch_private_info_t));
1348  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1349  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1350  }
1351  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352 
1353  KMP_MB();
1354  }
1355  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1356 
1357  // Perform the display affinity functionality for
1358  // serialized parallel regions
1359  if (__kmp_display_affinity) {
1360  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1361  this_thr->th.th_prev_num_threads != 1) {
1362  // NULL means use the affinity-format-var ICV
1363  __kmp_aux_display_affinity(global_tid, NULL);
1364  this_thr->th.th_prev_level = serial_team->t.t_level;
1365  this_thr->th.th_prev_num_threads = 1;
1366  }
1367  }
1368 
1369  if (__kmp_env_consistency_check)
1370  __kmp_push_parallel(global_tid, NULL);
1371 #if OMPT_SUPPORT
1372  serial_team->t.ompt_team_info.master_return_address = codeptr;
1373  if (ompt_enabled.enabled &&
1374  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1375  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1376 
1377  ompt_lw_taskteam_t lw_taskteam;
1378  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1379  &ompt_parallel_data, codeptr);
1380 
1381  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1382  // don't use lw_taskteam after linking. Content was swapped.
1383 
1384  /* OMPT implicit task begin */
1385  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1386  if (ompt_enabled.ompt_callback_implicit_task) {
1387  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1388  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1389  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1390  OMPT_CUR_TASK_INFO(this_thr)
1391  ->thread_num = __kmp_tid_from_gtid(global_tid);
1392  }
1393 
1394  /* OMPT state */
1395  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1396  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1397  }
1398 #endif
1399 }
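// Editor's note: an illustrative sketch of the nested nthreads lookup used in
// __kmp_serialized_parallel above (and again in __kmp_fork_call below): the
// enclosing team's nesting level indexes the parsed OMP_NUM_THREADS list, and
// the entry for the next level, if present, overrides the nproc ICV. Names are
// hypothetical.
#if 0
/* nth_list holds e.g. the parsed value of OMP_NUM_THREADS=4,2,1 and
   used_levels is the number of entries that were given. */
static int nproc_for_next_level(const int *nth_list, int used_levels,
                                int current_level, int default_nproc) {
  if (used_levels && current_level + 1 < used_levels)
    return nth_list[current_level + 1];
  return default_nproc;
}
#endif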
1400 
1401 /* most of the work for a fork */
1402 /* return true if we really went parallel, false if serialized */
1403 int __kmp_fork_call(ident_t *loc, int gtid,
1404  enum fork_context_e call_context, // Intel, GNU, ...
1405  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1406  kmp_va_list ap) {
1407  void **argv;
1408  int i;
1409  int master_tid;
1410  int master_this_cons;
1411  kmp_team_t *team;
1412  kmp_team_t *parent_team;
1413  kmp_info_t *master_th;
1414  kmp_root_t *root;
1415  int nthreads;
1416  int master_active;
1417  int master_set_numthreads;
1418  int level;
1419  int active_level;
1420  int teams_level;
1421 #if KMP_NESTED_HOT_TEAMS
1422  kmp_hot_team_ptr_t **p_hot_teams;
1423 #endif
1424  { // KMP_TIME_BLOCK
1425  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1426  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1427 
1428  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1429  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1430  /* Some systems prefer the stack for the root thread(s) to start with */
1431  /* some gap from the parent stack to prevent false sharing. */
1432  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1433  /* These 2 lines below are so this does not get optimized out */
1434  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1435  __kmp_stkpadding += (short)((kmp_int64)dummy);
1436  }
1437 
1438  /* initialize if needed */
1439  KMP_DEBUG_ASSERT(
1440  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1441  if (!TCR_4(__kmp_init_parallel))
1442  __kmp_parallel_initialize();
1443  __kmp_resume_if_soft_paused();
1444 
1445  /* setup current data */
1446  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1447  // shutdown
1448  parent_team = master_th->th.th_team;
1449  master_tid = master_th->th.th_info.ds.ds_tid;
1450  master_this_cons = master_th->th.th_local.this_construct;
1451  root = master_th->th.th_root;
1452  master_active = root->r.r_active;
1453  master_set_numthreads = master_th->th.th_set_nproc;
1454 
1455 #if OMPT_SUPPORT
1456  ompt_data_t ompt_parallel_data = ompt_data_none;
1457  ompt_data_t *parent_task_data;
1458  ompt_frame_t *ompt_frame;
1459  ompt_data_t *implicit_task_data;
1460  void *return_address = NULL;
1461 
1462  if (ompt_enabled.enabled) {
1463  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1464  NULL, NULL);
1465  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1466  }
1467 #endif
1468 
1469  // Nested level will be an index in the nested nthreads array
1470  level = parent_team->t.t_level;
1471  // used to launch non-serial teams even if nested is not allowed
1472  active_level = parent_team->t.t_active_level;
1473  // needed to check nesting inside the teams
1474  teams_level = master_th->th.th_teams_level;
1475 #if KMP_NESTED_HOT_TEAMS
1476  p_hot_teams = &master_th->th.th_hot_teams;
1477  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1478  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1479  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1480  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1481  // it is either actual or not needed (when active_level > 0)
1482  (*p_hot_teams)[0].hot_team_nth = 1;
1483  }
1484 #endif
1485 
1486 #if OMPT_SUPPORT
1487  if (ompt_enabled.enabled) {
1488  if (ompt_enabled.ompt_callback_parallel_begin) {
1489  int team_size = master_set_numthreads
1490  ? master_set_numthreads
1491  : get__nproc_2(parent_team, master_tid);
1492  int flags = OMPT_INVOKER(call_context) |
1493  ((microtask == (microtask_t)__kmp_teams_master)
1494  ? ompt_parallel_league
1495  : ompt_parallel_team);
1496  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1497  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1498  return_address);
1499  }
1500  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1501  }
1502 #endif
1503 
1504  master_th->th.th_ident = loc;
1505 
1506  if (master_th->th.th_teams_microtask && ap &&
1507  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1508  // AC: This is the start of a parallel region nested inside a teams construct.
1509  // The team is actual (hot); all workers are ready at the fork barrier.
1510  // No lock is needed to do minimal team initialization, then free the workers.
1511  parent_team->t.t_ident = loc;
1512  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1513  parent_team->t.t_argc = argc;
1514  argv = (void **)parent_team->t.t_argv;
1515  for (i = argc - 1; i >= 0; --i)
1516  *argv++ = va_arg(kmp_va_deref(ap), void *);
1517  // Increment our nested depth level, but do not increase the serialization count
1518  if (parent_team == master_th->th.th_serial_team) {
1519  // AC: we are in serialized parallel
1520  __kmpc_serialized_parallel(loc, gtid);
1521  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1522 
1523  if (call_context == fork_context_gnu) {
1524  // AC: need to decrement t_serialized for enquiry functions to work
1525  // correctly, will restore at join time
1526  parent_team->t.t_serialized--;
1527  return TRUE;
1528  }
1529 
1530 #if OMPT_SUPPORT
1531  void *dummy;
1532  void **exit_frame_p;
1533 
1534  ompt_lw_taskteam_t lw_taskteam;
1535 
1536  if (ompt_enabled.enabled) {
1537  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1538  &ompt_parallel_data, return_address);
1539  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1540 
1541  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1542  // don't use lw_taskteam after linking. Content was swapped.
1543 
1544  /* OMPT implicit task begin */
1545  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1546  if (ompt_enabled.ompt_callback_implicit_task) {
1547  OMPT_CUR_TASK_INFO(master_th)
1548  ->thread_num = __kmp_tid_from_gtid(gtid);
1549  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1550  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1551  implicit_task_data, 1,
1552  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553  }
1554 
1555  /* OMPT state */
1556  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1557  } else {
1558  exit_frame_p = &dummy;
1559  }
1560 #endif
1561  // AC: need to decrement t_serialized for enquiry functions to work
1562  // correctly, will restore at join time
1563  parent_team->t.t_serialized--;
1564 
1565  {
1566  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1567  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1568  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1569 #if OMPT_SUPPORT
1570  ,
1571  exit_frame_p
1572 #endif
1573  );
1574  }
1575 
1576 #if OMPT_SUPPORT
1577  if (ompt_enabled.enabled) {
1578  *exit_frame_p = NULL;
1579  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1580  if (ompt_enabled.ompt_callback_implicit_task) {
1581  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1582  ompt_scope_end, NULL, implicit_task_data, 1,
1583  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1584  }
1585  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1586  __ompt_lw_taskteam_unlink(master_th);
1587  if (ompt_enabled.ompt_callback_parallel_end) {
1588  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1589  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1590  OMPT_INVOKER(call_context) | ompt_parallel_team,
1591  return_address);
1592  }
1593  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1594  }
1595 #endif
1596  return TRUE;
1597  }
1598 
1599  parent_team->t.t_pkfn = microtask;
1600  parent_team->t.t_invoke = invoker;
1601  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1602  parent_team->t.t_active_level++;
1603  parent_team->t.t_level++;
1604  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1605 
1606 #if OMPT_SUPPORT
1607  if (ompt_enabled.enabled) {
1608  ompt_lw_taskteam_t lw_taskteam;
1609  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1610  &ompt_parallel_data, return_address);
1611  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1612  }
1613 #endif
1614 
1615  /* Change number of threads in the team if requested */
1616  if (master_set_numthreads) { // The parallel has num_threads clause
1617  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1618  // AC: we can only reduce the number of threads dynamically; we can't increase it
1619  kmp_info_t **other_threads = parent_team->t.t_threads;
1620  parent_team->t.t_nproc = master_set_numthreads;
1621  for (i = 0; i < master_set_numthreads; ++i) {
1622  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1623  }
1624  // Keep the extra threads hot in the team for possible subsequent parallel regions
1625  }
1626  master_th->th.th_set_nproc = 0;
1627  }
1628 
1629 #if USE_DEBUGGER
1630  if (__kmp_debugging) { // Let debugger override number of threads.
1631  int nth = __kmp_omp_num_threads(loc);
1632  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1633  master_set_numthreads = nth;
1634  }
1635  }
1636 #endif
1637 
1638 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1639  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1640  KMP_ITT_DEBUG) &&
1641  __kmp_forkjoin_frames_mode == 3 &&
1642  parent_team->t.t_active_level == 1 // only report frames at level 1
1643  && master_th->th.th_teams_size.nteams == 1) {
1644  kmp_uint64 tmp_time = __itt_get_timestamp();
1645  master_th->th.th_frame_time = tmp_time;
1646  parent_team->t.t_region_time = tmp_time;
1647  }
1648  if (__itt_stack_caller_create_ptr) {
1649  // create new stack stitching id before entering fork barrier
1650  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1651  }
1652 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1653 
1654  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1655  "master_th=%p, gtid=%d\n",
1656  root, parent_team, master_th, gtid));
1657  __kmp_internal_fork(loc, gtid, parent_team);
1658  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1659  "master_th=%p, gtid=%d\n",
1660  root, parent_team, master_th, gtid));
1661 
1662  if (call_context == fork_context_gnu)
1663  return TRUE;
1664 
1665  /* Invoke microtask for MASTER thread */
1666  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1667  parent_team->t.t_id, parent_team->t.t_pkfn));
1668 
1669  if (!parent_team->t.t_invoke(gtid)) {
1670  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1671  }
1672  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1673  parent_team->t.t_id, parent_team->t.t_pkfn));
1674  KMP_MB(); /* Flush all pending memory write invalidates. */
1675 
1676  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1677 
1678  return TRUE;
1679  } // Parallel closely nested in teams construct
1680 
1681 #if KMP_DEBUG
1682  if (__kmp_tasking_mode != tskm_immediate_exec) {
1683  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1684  parent_team->t.t_task_team[master_th->th.th_task_state]);
1685  }
1686 #endif
1687 
1688  if (parent_team->t.t_active_level >=
1689  master_th->th.th_current_task->td_icvs.max_active_levels) {
1690  nthreads = 1;
1691  } else {
1692  int enter_teams = ((ap == NULL && active_level == 0) ||
1693  (ap && teams_level > 0 && teams_level == level));
1694  nthreads =
1695  master_set_numthreads
1696  ? master_set_numthreads
1697  : get__nproc_2(
1698  parent_team,
1699  master_tid); // TODO: get nproc directly from current task
1700 
1701  // Check whether we need to take the forkjoin lock (there is no need for a
1702  // serialized parallel outside of a teams construct). This code was moved here
1703  // from __kmp_reserve_threads() to speed up nested serialized parallel regions.
1704  if (nthreads > 1) {
1705  if ((get__max_active_levels(master_th) == 1 &&
1706  (root->r.r_in_parallel && !enter_teams)) ||
1707  (__kmp_library == library_serial)) {
1708  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1709  " threads\n",
1710  gtid, nthreads));
1711  nthreads = 1;
1712  }
1713  }
1714  if (nthreads > 1) {
1715  /* determine how many new threads we can use */
1716  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1717  /* AC: If we execute teams from a parallel region (on the host), then the
1718  teams should be created, but each can have only 1 thread if nesting is
1719  disabled. If teams is called from a serial region, then the teams and their
1720  threads should be created regardless of the nesting setting. */
1721  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1722  nthreads, enter_teams);
1723  if (nthreads == 1) {
1724  // Free lock for single thread execution here; for multi-thread
1725  // execution it will be freed later after team of threads created
1726  // and initialized
1727  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1728  }
1729  }
1730  }
1731  KMP_DEBUG_ASSERT(nthreads > 0);
1732 
1733  // If we temporarily changed the set number of threads then restore it now
1734  master_th->th.th_set_nproc = 0;
1735 
1736  /* create a serialized parallel region? */
1737  if (nthreads == 1) {
1738 /* josh todo: hypothetical question: what do we do for OS X*? */
1739 #if KMP_OS_LINUX && \
1740  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1741  void *args[argc];
1742 #else
1743  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1744 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1745  KMP_ARCH_AARCH64) */
1746 
1747  KA_TRACE(20,
1748  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1749 
1750  __kmpc_serialized_parallel(loc, gtid);
1751 
1752  if (call_context == fork_context_intel) {
1753  /* TODO this sucks, use the compiler itself to pass args! :) */
1754  master_th->th.th_serial_team->t.t_ident = loc;
1755  if (!ap) {
1756  // revert change made in __kmpc_serialized_parallel()
1757  master_th->th.th_serial_team->t.t_level--;
1758 // Get args from parent team for teams construct
1759 
1760 #if OMPT_SUPPORT
1761  void *dummy;
1762  void **exit_frame_p;
1763  ompt_task_info_t *task_info;
1764 
1765  ompt_lw_taskteam_t lw_taskteam;
1766 
1767  if (ompt_enabled.enabled) {
1768  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1769  &ompt_parallel_data, return_address);
1770 
1771  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1772  // don't use lw_taskteam after linking; its content was swapped
1773 
1774  task_info = OMPT_CUR_TASK_INFO(master_th);
1775  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1776  if (ompt_enabled.ompt_callback_implicit_task) {
1777  OMPT_CUR_TASK_INFO(master_th)
1778  ->thread_num = __kmp_tid_from_gtid(gtid);
1779  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1780  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1781  &(task_info->task_data), 1,
1782  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1783  ompt_task_implicit);
1784  }
1785 
1786  /* OMPT state */
1787  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1788  } else {
1789  exit_frame_p = &dummy;
1790  }
1791 #endif
1792 
1793  {
1794  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1795  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1796  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1797  parent_team->t.t_argv
1798 #if OMPT_SUPPORT
1799  ,
1800  exit_frame_p
1801 #endif
1802  );
1803  }
1804 
1805 #if OMPT_SUPPORT
1806  if (ompt_enabled.enabled) {
1807  *exit_frame_p = NULL;
1808  if (ompt_enabled.ompt_callback_implicit_task) {
1809  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1810  ompt_scope_end, NULL, &(task_info->task_data), 1,
1811  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1812  ompt_task_implicit);
1813  }
1814  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1815  __ompt_lw_taskteam_unlink(master_th);
1816  if (ompt_enabled.ompt_callback_parallel_end) {
1817  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1818  &ompt_parallel_data, parent_task_data,
1819  OMPT_INVOKER(call_context) | ompt_parallel_team,
1820  return_address);
1821  }
1822  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1823  }
1824 #endif
1825  } else if (microtask == (microtask_t)__kmp_teams_master) {
1826  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1827  master_th->th.th_serial_team);
1828  team = master_th->th.th_team;
1829  // team->t.t_pkfn = microtask;
1830  team->t.t_invoke = invoker;
1831  __kmp_alloc_argv_entries(argc, team, TRUE);
1832  team->t.t_argc = argc;
1833  argv = (void **)team->t.t_argv;
1834  if (ap) {
1835  for (i = argc - 1; i >= 0; --i)
1836  *argv++ = va_arg(kmp_va_deref(ap), void *);
1837  } else {
1838  for (i = 0; i < argc; ++i)
1839  // Get args from parent team for teams construct
1840  argv[i] = parent_team->t.t_argv[i];
1841  }
1842  // AC: revert change made in __kmpc_serialized_parallel()
1843  // because initial code in teams should have level=0
1844  team->t.t_level--;
1845  // AC: call special invoker for outer "parallel" of teams construct
1846  invoker(gtid);
1847 #if OMPT_SUPPORT
1848  if (ompt_enabled.enabled) {
1849  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1850  if (ompt_enabled.ompt_callback_implicit_task) {
1851  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1852  ompt_scope_end, NULL, &(task_info->task_data), 0,
1853  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1854  }
1855  if (ompt_enabled.ompt_callback_parallel_end) {
1856  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1857  &ompt_parallel_data, parent_task_data,
1858  OMPT_INVOKER(call_context) | ompt_parallel_league,
1859  return_address);
1860  }
1861  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1862  }
1863 #endif
1864  } else {
1865  argv = args;
1866  for (i = argc - 1; i >= 0; --i)
1867  *argv++ = va_arg(kmp_va_deref(ap), void *);
1868  KMP_MB();
1869 
1870 #if OMPT_SUPPORT
1871  void *dummy;
1872  void **exit_frame_p;
1873  ompt_task_info_t *task_info;
1874 
1875  ompt_lw_taskteam_t lw_taskteam;
1876 
1877  if (ompt_enabled.enabled) {
1878  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1879  &ompt_parallel_data, return_address);
1880  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1881  // don't use lw_taskteam after linking; its content was swapped
1882  task_info = OMPT_CUR_TASK_INFO(master_th);
1883  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1884 
1885  /* OMPT implicit task begin */
1886  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1887  if (ompt_enabled.ompt_callback_implicit_task) {
1888  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1889  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1890  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1891  ompt_task_implicit);
1892  OMPT_CUR_TASK_INFO(master_th)
1893  ->thread_num = __kmp_tid_from_gtid(gtid);
1894  }
1895 
1896  /* OMPT state */
1897  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1898  } else {
1899  exit_frame_p = &dummy;
1900  }
1901 #endif
1902 
1903  {
1904  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1905  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1906  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1907 #if OMPT_SUPPORT
1908  ,
1909  exit_frame_p
1910 #endif
1911  );
1912  }
1913 
1914 #if OMPT_SUPPORT
1915  if (ompt_enabled.enabled) {
1916  *exit_frame_p = NULL;
1917  if (ompt_enabled.ompt_callback_implicit_task) {
1918  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1919  ompt_scope_end, NULL, &(task_info->task_data), 1,
1920  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1921  ompt_task_implicit);
1922  }
1923 
1924  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1925  __ompt_lw_taskteam_unlink(master_th);
1926  if (ompt_enabled.ompt_callback_parallel_end) {
1927  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1928  &ompt_parallel_data, parent_task_data,
1929  OMPT_INVOKER(call_context) | ompt_parallel_team,
1930  return_address);
1931  }
1932  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1933  }
1934 #endif
1935  }
1936  } else if (call_context == fork_context_gnu) {
1937 #if OMPT_SUPPORT
1938  ompt_lw_taskteam_t lwt;
1939  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1940  return_address);
1941 
1942  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1943  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1944 // don't use lw_taskteam after linking; its content was swapped
1945 #endif
1946 
1947  // we were called from GNU native code
1948  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1949  return FALSE;
1950  } else {
1951  KMP_ASSERT2(call_context < fork_context_last,
1952  "__kmp_fork_call: unknown fork_context parameter");
1953  }
1954 
1955  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1956  KMP_MB();
1957  return FALSE;
1958  } // if (nthreads == 1)
1959 
1960  // GEH: only modify the executing flag in the case when not serialized
1961  // serialized case is handled in kmpc_serialized_parallel
1962  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1963  "curtask=%p, curtask_max_aclevel=%d\n",
1964  parent_team->t.t_active_level, master_th,
1965  master_th->th.th_current_task,
1966  master_th->th.th_current_task->td_icvs.max_active_levels));
1967  // TODO: GEH - cannot do this assertion because root thread not set up as
1968  // executing
1969  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1970  master_th->th.th_current_task->td_flags.executing = 0;
1971 
1972  if (!master_th->th.th_teams_microtask || level > teams_level) {
1973  /* Increment our nested depth level */
1974  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1975  }
1976 
1977  // See if we need to make a copy of the ICVs.
1978  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1979  if ((level + 1 < __kmp_nested_nth.used) &&
1980  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1981  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1982  } else {
1983  nthreads_icv = 0; // don't update
1984  }
1985 
1986  // Figure out the proc_bind_policy for the new team.
1987  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1988  kmp_proc_bind_t proc_bind_icv =
1989  proc_bind_default; // proc_bind_default means don't update
1990  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1991  proc_bind = proc_bind_false;
1992  } else {
1993  if (proc_bind == proc_bind_default) {
1994  // No proc_bind clause specified; use current proc-bind-var for this
1995  // parallel region
1996  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1997  }
1998  /* else: The proc_bind policy was specified explicitly on parallel clause.
1999  This overrides proc-bind-var for this parallel region, but does not
2000  change proc-bind-var. */
2001  // Figure the value of proc-bind-var for the child threads.
2002  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2003  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2004  master_th->th.th_current_task->td_icvs.proc_bind)) {
2005  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2006  }
2007  }
2008 
2009  // Reset for next parallel region
2010  master_th->th.th_set_proc_bind = proc_bind_default;
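// A hedged user-level sketch (illustrative only; assumes standard OpenMP
// proc_bind semantics, e.g. OMP_PROC_BIND=spread,close) of the precedence
// resolved above: an explicit clause overrides proc-bind-var for one region
// without changing the ICV, and the children's value comes from the nested
// bind list.
//
//   #pragma omp parallel proc_bind(master) // this region: master (clause),
//   { /* ... */ }                          //   children still get "close"
//   #pragma omp parallel                   // this region: spread (ICV)
//   { /* ... */ }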
2011 
2012  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2013  kmp_internal_control_t new_icvs;
2014  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2015  new_icvs.next = NULL;
2016  if (nthreads_icv > 0) {
2017  new_icvs.nproc = nthreads_icv;
2018  }
2019  if (proc_bind_icv != proc_bind_default) {
2020  new_icvs.proc_bind = proc_bind_icv;
2021  }
2022 
2023  /* allocate a new parallel team */
2024  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2025  team = __kmp_allocate_team(root, nthreads, nthreads,
2026 #if OMPT_SUPPORT
2027  ompt_parallel_data,
2028 #endif
2029  proc_bind, &new_icvs,
2030  argc USE_NESTED_HOT_ARG(master_th));
2031  } else {
2032  /* allocate a new parallel team */
2033  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2034  team = __kmp_allocate_team(root, nthreads, nthreads,
2035 #if OMPT_SUPPORT
2036  ompt_parallel_data,
2037 #endif
2038  proc_bind,
2039  &master_th->th.th_current_task->td_icvs,
2040  argc USE_NESTED_HOT_ARG(master_th));
2041  }
2042  KF_TRACE(
2043  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2044 
2045  /* setup the new team */
2046  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2047  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2048  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2049  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2050  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2051 #if OMPT_SUPPORT
2052  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2053  return_address);
2054 #endif
2055  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2056  // TODO: parent_team->t.t_level == INT_MAX ???
2057  if (!master_th->th.th_teams_microtask || level > teams_level) {
2058  int new_level = parent_team->t.t_level + 1;
2059  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2060  new_level = parent_team->t.t_active_level + 1;
2061  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2062  } else {
2063  // AC: Do not increase parallel level at start of the teams construct
2064  int new_level = parent_team->t.t_level;
2065  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2066  new_level = parent_team->t.t_active_level;
2067  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2068  }
2069  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2070  // set master's schedule as new run-time schedule
2071  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2072 
2073  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2074  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2075 
2076  // Update the floating point rounding in the team if required.
2077  propagateFPControl(team);
2078 
2079  if (__kmp_tasking_mode != tskm_immediate_exec) {
2080  // Set master's task team to the new team's task team. Unless this is a hot
2081  // team, it should be NULL.
2082  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2083  parent_team->t.t_task_team[master_th->th.th_task_state]);
2084  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2085  "%p, new task_team %p / team %p\n",
2086  __kmp_gtid_from_thread(master_th),
2087  master_th->th.th_task_team, parent_team,
2088  team->t.t_task_team[master_th->th.th_task_state], team));
2089 
2090  if (active_level || master_th->th.th_task_team) {
2091  // Save a memo of the master's task_state
2092  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2093  if (master_th->th.th_task_state_top >=
2094  master_th->th.th_task_state_stack_sz) { // increase size
2095  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2096  kmp_uint8 *old_stack, *new_stack;
2097  kmp_uint32 i;
2098  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2099  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2100  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2101  }
2102  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2103  ++i) { // zero-init rest of stack
2104  new_stack[i] = 0;
2105  }
2106  old_stack = master_th->th.th_task_state_memo_stack;
2107  master_th->th.th_task_state_memo_stack = new_stack;
2108  master_th->th.th_task_state_stack_sz = new_size;
2109  __kmp_free(old_stack);
2110  }
2111  // Store master's task_state on stack
2112  master_th->th
2113  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2114  master_th->th.th_task_state;
2115  master_th->th.th_task_state_top++;
2116 #if KMP_NESTED_HOT_TEAMS
2117  if (master_th->th.th_hot_teams &&
2118  active_level < __kmp_hot_teams_max_level &&
2119  team == master_th->th.th_hot_teams[active_level].hot_team) {
2120  // Restore master's nested state if nested hot team
2121  master_th->th.th_task_state =
2122  master_th->th
2123  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2124  } else {
2125 #endif
2126  master_th->th.th_task_state = 0;
2127 #if KMP_NESTED_HOT_TEAMS
2128  }
2129 #endif
2130  }
2131 #if !KMP_NESTED_HOT_TEAMS
2132  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2133  (team == root->r.r_hot_team));
2134 #endif
2135  }
2136 
2137  KA_TRACE(
2138  20,
2139  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2140  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2141  team->t.t_nproc));
2142  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2143  (team->t.t_master_tid == 0 &&
2144  (team->t.t_parent == root->r.r_root_team ||
2145  team->t.t_parent->t.t_serialized)));
2146  KMP_MB();
2147 
2148  /* now, setup the arguments */
2149  argv = (void **)team->t.t_argv;
2150  if (ap) {
2151  for (i = argc - 1; i >= 0; --i) {
2152  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2153  KMP_CHECK_UPDATE(*argv, new_argv);
2154  argv++;
2155  }
2156  } else {
2157  for (i = 0; i < argc; ++i) {
2158  // Get args from parent team for teams construct
2159  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2160  }
2161  }
2162 
2163  /* now actually fork the threads */
2164  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2165  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2166  root->r.r_active = TRUE;
2167 
2168  __kmp_fork_team_threads(root, team, master_th, gtid);
2169  __kmp_setup_icv_copy(team, nthreads,
2170  &master_th->th.th_current_task->td_icvs, loc);
2171 
2172 #if OMPT_SUPPORT
2173  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2174 #endif
2175 
2176  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2177 
2178 #if USE_ITT_BUILD
2179  if (team->t.t_active_level == 1 // only report frames at level 1
2180  && !master_th->th.th_teams_microtask) { // not in teams construct
2181 #if USE_ITT_NOTIFY
2182  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2183  (__kmp_forkjoin_frames_mode == 3 ||
2184  __kmp_forkjoin_frames_mode == 1)) {
2185  kmp_uint64 tmp_time = 0;
2186  if (__itt_get_timestamp_ptr)
2187  tmp_time = __itt_get_timestamp();
2188  // Internal fork - report frame begin
2189  master_th->th.th_frame_time = tmp_time;
2190  if (__kmp_forkjoin_frames_mode == 3)
2191  team->t.t_region_time = tmp_time;
2192  } else
2193 // only one notification scheme (either "submit" or "forking/joined", not both)
2194 #endif /* USE_ITT_NOTIFY */
2195  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2196  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2197  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2198  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2199  }
2200  }
2201 #endif /* USE_ITT_BUILD */
2202 
2203  /* now go on and do the work */
2204  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2205  KMP_MB();
2206  KF_TRACE(10,
2207  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2208  root, team, master_th, gtid));
2209 
2210 #if USE_ITT_BUILD
2211  if (__itt_stack_caller_create_ptr) {
2212  team->t.t_stack_id =
2213  __kmp_itt_stack_caller_create(); // create new stack stitching id
2214  // before entering fork barrier
2215  }
2216 #endif /* USE_ITT_BUILD */
2217 
2218  // AC: skip __kmp_internal_fork for the teams construct; let only the master
2219  // threads execute
2220  if (ap) {
2221  __kmp_internal_fork(loc, gtid, team);
2222  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2223  "master_th=%p, gtid=%d\n",
2224  root, team, master_th, gtid));
2225  }
2226 
2227  if (call_context == fork_context_gnu) {
2228  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2229  return TRUE;
2230  }
2231 
2232  /* Invoke microtask for MASTER thread */
2233  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2234  team->t.t_id, team->t.t_pkfn));
2235  } // END of timer KMP_fork_call block
2236 
2237 #if KMP_STATS_ENABLED
2238  // If beginning a teams construct, then change thread state
2239  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2240  if (!ap) {
2241  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2242  }
2243 #endif
2244 
2245  if (!team->t.t_invoke(gtid)) {
2246  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2247  }
2248 
2249 #if KMP_STATS_ENABLED
2250  // If was beginning of a teams construct, then reset thread state
2251  if (!ap) {
2252  KMP_SET_THREAD_STATE(previous_state);
2253  }
2254 #endif
2255 
2256  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2257  team->t.t_id, team->t.t_pkfn));
2258  KMP_MB(); /* Flush all pending memory write invalidates. */
2259 
2260  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2261 
2262 #if OMPT_SUPPORT
2263  if (ompt_enabled.enabled) {
2264  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2265  }
2266 #endif
2267 
2268  return TRUE;
2269 }
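// A minimal user-level sketch (illustrative only; assumes a standard OpenMP
// compiler) of the nthreads == 1 serialization path taken above: once the
// active level reaches max-active-levels, the nested region is run by the
// encountering thread through the serialized code path.
//
//   omp_set_max_active_levels(1);
//   #pragma omp parallel num_threads(4)   // active level 1: forks 4 threads
//   {
//     #pragma omp parallel num_threads(4) // limit reached: nthreads forced to
//     { /* work */ }                      //   1, region executed serialized
//   }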
2270 
2271 #if OMPT_SUPPORT
2272 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2273  kmp_team_t *team) {
2274  // restore state outside the region
2275  thread->th.ompt_thread_info.state =
2276  ((team->t.t_serialized) ? ompt_state_work_serial
2277  : ompt_state_work_parallel);
2278 }
2279 
2280 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2281  kmp_team_t *team, ompt_data_t *parallel_data,
2282  int flags, void *codeptr) {
2283  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2284  if (ompt_enabled.ompt_callback_parallel_end) {
2285  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2286  parallel_data, &(task_info->task_data), flags, codeptr);
2287  }
2288 
2289  task_info->frame.enter_frame = ompt_data_none;
2290  __kmp_join_restore_state(thread, team);
2291 }
2292 #endif
2293 
2294 void __kmp_join_call(ident_t *loc, int gtid
2295 #if OMPT_SUPPORT
2296  ,
2297  enum fork_context_e fork_context
2298 #endif
2299  ,
2300  int exit_teams) {
2301  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302  kmp_team_t *team;
2303  kmp_team_t *parent_team;
2304  kmp_info_t *master_th;
2305  kmp_root_t *root;
2306  int master_active;
2307 
2308  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2309 
2310  /* setup current data */
2311  master_th = __kmp_threads[gtid];
2312  root = master_th->th.th_root;
2313  team = master_th->th.th_team;
2314  parent_team = team->t.t_parent;
2315 
2316  master_th->th.th_ident = loc;
2317 
2318 #if OMPT_SUPPORT
2319  void *team_microtask = (void *)team->t.t_pkfn;
2320  // For GOMP interface with serialized parallel, need the
2321  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2322  // and end-parallel events.
2323  if (ompt_enabled.enabled &&
2324  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2325  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2326  }
2327 #endif
2328 
2329 #if KMP_DEBUG
2330  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2331  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2332  "th_task_team = %p\n",
2333  __kmp_gtid_from_thread(master_th), team,
2334  team->t.t_task_team[master_th->th.th_task_state],
2335  master_th->th.th_task_team));
2336  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2337  team->t.t_task_team[master_th->th.th_task_state]);
2338  }
2339 #endif
2340 
2341  if (team->t.t_serialized) {
2342  if (master_th->th.th_teams_microtask) {
2343  // We are in teams construct
2344  int level = team->t.t_level;
2345  int tlevel = master_th->th.th_teams_level;
2346  if (level == tlevel) {
2347  // AC: we haven't incremented it earlier at start of teams construct,
2348  // so do it here - at the end of teams construct
2349  team->t.t_level++;
2350  } else if (level == tlevel + 1) {
2351  // AC: we are exiting parallel inside teams, need to increment
2352  // serialization in order to restore it in the next call to
2353  // __kmpc_end_serialized_parallel
2354  team->t.t_serialized++;
2355  }
2356  }
2357  __kmpc_end_serialized_parallel(loc, gtid);
2358 
2359 #if OMPT_SUPPORT
2360  if (ompt_enabled.enabled) {
2361  __kmp_join_restore_state(master_th, parent_team);
2362  }
2363 #endif
2364 
2365  return;
2366  }
2367 
2368  master_active = team->t.t_master_active;
2369 
2370  if (!exit_teams) {
2371  // AC: No barrier for internal teams at exit from the teams construct,
2372  // but there is a barrier for the external team (league).
2373  __kmp_internal_join(loc, gtid, team);
2374  } else {
2375  master_th->th.th_task_state =
2376  0; // AC: no tasking in teams (out of any parallel)
2377  }
2378 
2379  KMP_MB();
2380 
2381 #if OMPT_SUPPORT
2382  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2383  void *codeptr = team->t.ompt_team_info.master_return_address;
2384 #endif
2385 
2386 #if USE_ITT_BUILD
2387  if (__itt_stack_caller_create_ptr) {
2388  // destroy the stack stitching id after join barrier
2389  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2390  }
2391  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2392  if (team->t.t_active_level == 1 &&
2393  (!master_th->th.th_teams_microtask || /* not in teams construct */
2394  master_th->th.th_teams_size.nteams == 1)) {
2395  master_th->th.th_ident = loc;
2396  // only one notification scheme (either "submit" or "forking/joined", not
2397  // both)
2398  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2399  __kmp_forkjoin_frames_mode == 3)
2400  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2401  master_th->th.th_frame_time, 0, loc,
2402  master_th->th.th_team_nproc, 1);
2403  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2404  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2405  __kmp_itt_region_joined(gtid);
2406  } // active_level == 1
2407 #endif /* USE_ITT_BUILD */
2408 
2409  if (master_th->th.th_teams_microtask && !exit_teams &&
2410  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2411  team->t.t_level == master_th->th.th_teams_level + 1) {
2412 // AC: We need to leave the team structure intact at the end of parallel
2413 // inside the teams construct, so that at the next parallel same (hot) team
2414 // works, only adjust nesting levels
2415 #if OMPT_SUPPORT
2416  ompt_data_t ompt_parallel_data = ompt_data_none;
2417  if (ompt_enabled.enabled) {
2418  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2419  if (ompt_enabled.ompt_callback_implicit_task) {
2420  int ompt_team_size = team->t.t_nproc;
2421  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2422  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2423  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2424  }
2425  task_info->frame.exit_frame = ompt_data_none;
2426  task_info->task_data = ompt_data_none;
2427  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2428  __ompt_lw_taskteam_unlink(master_th);
2429  }
2430 #endif
2431  /* Decrement our nested depth level */
2432  team->t.t_level--;
2433  team->t.t_active_level--;
2434  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2435 
2436  // Restore number of threads in the team if needed. This code relies on
2437  // the proper adjustment of th_teams_size.nth after the fork in
2438  // __kmp_teams_master on each teams master in the case that
2439  // __kmp_reserve_threads reduced it.
2440  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2441  int old_num = master_th->th.th_team_nproc;
2442  int new_num = master_th->th.th_teams_size.nth;
2443  kmp_info_t **other_threads = team->t.t_threads;
2444  team->t.t_nproc = new_num;
2445  for (int i = 0; i < old_num; ++i) {
2446  other_threads[i]->th.th_team_nproc = new_num;
2447  }
2448  // Adjust states of non-used threads of the team
2449  for (int i = old_num; i < new_num; ++i) {
2450  // Re-initialize thread's barrier data.
2451  KMP_DEBUG_ASSERT(other_threads[i]);
2452  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2453  for (int b = 0; b < bs_last_barrier; ++b) {
2454  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2455  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2456 #if USE_DEBUGGER
2457  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2458 #endif
2459  }
2460  if (__kmp_tasking_mode != tskm_immediate_exec) {
2461  // Synchronize thread's task state
2462  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2463  }
2464  }
2465  }
2466 
2467 #if OMPT_SUPPORT
2468  if (ompt_enabled.enabled) {
2469  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2470  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2471  }
2472 #endif
2473 
2474  return;
2475  }
2476 
2477  /* do cleanup and restore the parent team */
2478  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2479  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2480 
2481  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2482 
2483  /* jc: The following lock has instructions with REL and ACQ semantics,
2484  separating the parallel user code called in this parallel region
2485  from the serial user code called after this function returns. */
2486  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2487 
2488  if (!master_th->th.th_teams_microtask ||
2489  team->t.t_level > master_th->th.th_teams_level) {
2490  /* Decrement our nested depth level */
2491  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2492  }
2493  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2494 
2495 #if OMPT_SUPPORT
2496  if (ompt_enabled.enabled) {
2497  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2498  if (ompt_enabled.ompt_callback_implicit_task) {
2499  int flags = (team_microtask == (void *)__kmp_teams_master)
2500  ? ompt_task_initial
2501  : ompt_task_implicit;
2502  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2503  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2504  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2505  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2506  }
2507  task_info->frame.exit_frame = ompt_data_none;
2508  task_info->task_data = ompt_data_none;
2509  }
2510 #endif
2511 
2512  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2513  master_th, team));
2514  __kmp_pop_current_task_from_thread(master_th);
2515 
2516 #if KMP_AFFINITY_SUPPORTED
2517  // Restore master thread's partition.
2518  master_th->th.th_first_place = team->t.t_first_place;
2519  master_th->th.th_last_place = team->t.t_last_place;
2520 #endif // KMP_AFFINITY_SUPPORTED
2521  master_th->th.th_def_allocator = team->t.t_def_allocator;
2522 
2523  updateHWFPControl(team);
2524 
2525  if (root->r.r_active != master_active)
2526  root->r.r_active = master_active;
2527 
2528  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2529  master_th)); // this will free worker threads
2530 
2531  /* This race was fun to find. Make sure the following is in the critical
2532  region; otherwise assertions may fail occasionally, since the old team may be
2533  reallocated and the hierarchy can appear inconsistent. It is actually safe to
2534  run and won't cause any bugs, but it will cause those assertion failures. It's
2535  only one deref & assign, so we might as well keep it in the critical region. */
2536  master_th->th.th_team = parent_team;
2537  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2538  master_th->th.th_team_master = parent_team->t.t_threads[0];
2539  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2540 
2541  /* restore serialized team, if need be */
2542  if (parent_team->t.t_serialized &&
2543  parent_team != master_th->th.th_serial_team &&
2544  parent_team != root->r.r_root_team) {
2545  __kmp_free_team(root,
2546  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2547  master_th->th.th_serial_team = parent_team;
2548  }
2549 
2550  if (__kmp_tasking_mode != tskm_immediate_exec) {
2551  if (master_th->th.th_task_state_top >
2552  0) { // Restore task state from memo stack
2553  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2554  // Remember master's state if we re-use this nested hot team
2555  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2556  master_th->th.th_task_state;
2557  --master_th->th.th_task_state_top; // pop
2558  // Now restore state at this level
2559  master_th->th.th_task_state =
2560  master_th->th
2561  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2562  }
2563  // Copy the task team from the parent team to the master thread
2564  master_th->th.th_task_team =
2565  parent_team->t.t_task_team[master_th->th.th_task_state];
2566  KA_TRACE(20,
2567  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2568  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2569  parent_team));
2570  }
2571 
2572  // TODO: GEH - cannot do this assertion because root thread not set up as
2573  // executing
2574  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2575  master_th->th.th_current_task->td_flags.executing = 1;
2576 
2577  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2578 
2579 #if OMPT_SUPPORT
2580  int flags =
2581  OMPT_INVOKER(fork_context) |
2582  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2583  : ompt_parallel_team);
2584  if (ompt_enabled.enabled) {
2585  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2586  codeptr);
2587  }
2588 #endif
2589 
2590  KMP_MB();
2591  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2592 }
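// Illustrative summary (a sketch, not additional runtime logic): the
// th_task_state memo stack restored above pairs with the push done in
// __kmp_fork_call. Conceptually:
//
//   fork:  memo_stack[top++] = th_task_state;  th_task_state = 0;  // new team
//   join:  th_task_state = memo_stack[--top];                      // restore
//
// With nested hot teams enabled (e.g. via the runtime's KMP_HOT_TEAMS_MAX_LEVEL
// setting), this lets a reused inner hot team keep a consistent task-team slot
// across repeated parallel regions.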
2593 
2594 /* Check whether we should push an internal control record onto the
2595  serial team stack. If so, do it. */
2596 void __kmp_save_internal_controls(kmp_info_t *thread) {
2597 
2598  if (thread->th.th_team != thread->th.th_serial_team) {
2599  return;
2600  }
2601  if (thread->th.th_team->t.t_serialized > 1) {
2602  int push = 0;
2603 
2604  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2605  push = 1;
2606  } else {
2607  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2608  thread->th.th_team->t.t_serialized) {
2609  push = 1;
2610  }
2611  }
2612  if (push) { /* push a record on the serial team's stack */
2613  kmp_internal_control_t *control =
2614  (kmp_internal_control_t *)__kmp_allocate(
2615  sizeof(kmp_internal_control_t));
2616 
2617  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2618 
2619  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2620 
2621  control->next = thread->th.th_team->t.t_control_stack_top;
2622  thread->th.th_team->t.t_control_stack_top = control;
2623  }
2624  }
2625 }
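// A user-level sketch (illustrative only; assumes a standard OpenMP compiler)
// of when a record gets pushed: ICV changes made inside nested *serialized*
// regions are saved per serialization level so they can be restored when the
// serialized region ends.
//
//   #pragma omp parallel if(0)      // serialized
//   {
//     #pragma omp parallel if(0)    // nested serialized, t_serialized > 1
//     {
//       omp_set_num_threads(2);     // __kmp_save_internal_controls() pushes
//     }                             //   the current ICVs before this change
//   }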
2626 
2627 /* Changes set_nproc */
2628 void __kmp_set_num_threads(int new_nth, int gtid) {
2629  kmp_info_t *thread;
2630  kmp_root_t *root;
2631 
2632  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2633  KMP_DEBUG_ASSERT(__kmp_init_serial);
2634 
2635  if (new_nth < 1)
2636  new_nth = 1;
2637  else if (new_nth > __kmp_max_nth)
2638  new_nth = __kmp_max_nth;
2639 
2640  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2641  thread = __kmp_threads[gtid];
2642  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2643  return; // nothing to do
2644 
2645  __kmp_save_internal_controls(thread);
2646 
2647  set__nproc(thread, new_nth);
2648 
2649  // If this omp_set_num_threads() call will cause the hot team size to be
2650  // reduced (in the absence of a num_threads clause), then reduce it now,
2651  // rather than waiting for the next parallel region.
2652  root = thread->th.th_root;
2653  if (__kmp_init_parallel && (!root->r.r_active) &&
2654  (root->r.r_hot_team->t.t_nproc > new_nth)
2655 #if KMP_NESTED_HOT_TEAMS
2656  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2657 #endif
2658  ) {
2659  kmp_team_t *hot_team = root->r.r_hot_team;
2660  int f;
2661 
2662  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2663 
2664  // Release the extra threads we don't need any more.
2665  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2666  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2667  if (__kmp_tasking_mode != tskm_immediate_exec) {
2668  // When decreasing the team size, threads no longer in the team should
2669  // unreference the task team.
2670  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2671  }
2672  __kmp_free_thread(hot_team->t.t_threads[f]);
2673  hot_team->t.t_threads[f] = NULL;
2674  }
2675  hot_team->t.t_nproc = new_nth;
2676 #if KMP_NESTED_HOT_TEAMS
2677  if (thread->th.th_hot_teams) {
2678  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2679  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2680  }
2681 #endif
2682 
2683  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2684 
2685  // Update the t_nproc field in the threads that are still active.
2686  for (f = 0; f < new_nth; f++) {
2687  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2688  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2689  }
2690  // Special flag in case omp_set_num_threads() call
2691  hot_team->t.t_size_changed = -1;
2692  }
2693 }
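// A user-level sketch (illustrative only; assumes a standard OpenMP compiler)
// of the hot-team trimming above: it happens eagerly, at the call site, when
// the new value is smaller than the current hot team and no region is active.
//
//   #pragma omp parallel num_threads(8)  // hot team grows to 8 threads
//   { /* ... */ }
//   omp_set_num_threads(2);              // hot team trimmed to 2 right here,
//   #pragma omp parallel                 //   not at the next fork
//   { /* ... */ }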
2694 
2695 /* Changes max_active_levels */
2696 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2697  kmp_info_t *thread;
2698 
2699  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2700  "%d = (%d)\n",
2701  gtid, max_active_levels));
2702  KMP_DEBUG_ASSERT(__kmp_init_serial);
2703 
2704  // validate max_active_levels
2705  if (max_active_levels < 0) {
2706  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2707  // We ignore this call if the user has specified a negative value.
2708  // The current setting won't be changed. The last valid setting will be
2709  // used. A warning will be issued (if warnings are allowed as controlled by
2710  // the KMP_WARNINGS env var).
2711  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2712  "max_active_levels for thread %d = (%d)\n",
2713  gtid, max_active_levels));
2714  return;
2715  }
2716  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2717  // it's OK, the max_active_levels is within the valid range: [ 0;
2718  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2719  // We allow a zero value. (implementation defined behavior)
2720  } else {
2721  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2722  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2723  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2724  // Current upper limit is MAX_INT. (implementation defined behavior)
2725  // If the input exceeds the upper limit, we correct the input to be the
2726  // upper limit. (implementation defined behavior)
2727  // Actually, the flow should never get here while the upper limit is MAX_INT.
2728  }
2729  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2730  "max_active_levels for thread %d = (%d)\n",
2731  gtid, max_active_levels));
2732 
2733  thread = __kmp_threads[gtid];
2734 
2735  __kmp_save_internal_controls(thread);
2736 
2737  set__max_active_levels(thread, max_active_levels);
2738 }
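// A user-level sketch (illustrative only; assumes the standard OpenMP API) of
// the validation above:
//
//   omp_set_max_active_levels(-1); // ignored; warning (if KMP_WARNINGS allows)
//   omp_set_max_active_levels(0);  // accepted: zero active levels, so every
//                                  //   parallel region runs serialized
//   omp_set_max_active_levels(2);  // allow two nested active levels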
2739 
2740 /* Gets max_active_levels */
2741 int __kmp_get_max_active_levels(int gtid) {
2742  kmp_info_t *thread;
2743 
2744  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2745  KMP_DEBUG_ASSERT(__kmp_init_serial);
2746 
2747  thread = __kmp_threads[gtid];
2748  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2749  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2750  "curtask_maxaclevel=%d\n",
2751  gtid, thread->th.th_current_task,
2752  thread->th.th_current_task->td_icvs.max_active_levels));
2753  return thread->th.th_current_task->td_icvs.max_active_levels;
2754 }
2755 
2756 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2757 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2758 
2759 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2760 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2761  kmp_info_t *thread;
2762  kmp_sched_t orig_kind;
2763  // kmp_team_t *team;
2764 
2765  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2766  gtid, (int)kind, chunk));
2767  KMP_DEBUG_ASSERT(__kmp_init_serial);
2768 
2769  // Check if the kind parameter is valid, correct if needed.
2770  // Valid parameters should fit in one of two intervals - standard or extended:
2771  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2772  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2773  orig_kind = kind;
2774  kind = __kmp_sched_without_mods(kind);
2775 
2776  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2777  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2778  // TODO: Hint needs attention in case we change the default schedule.
2779  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2780  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2781  __kmp_msg_null);
2782  kind = kmp_sched_default;
2783  chunk = 0; // ignore chunk value in case of bad kind
2784  }
2785 
2786  thread = __kmp_threads[gtid];
2787 
2788  __kmp_save_internal_controls(thread);
2789 
2790  if (kind < kmp_sched_upper_std) {
2791  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2792  // differentiate static chunked vs. unchunked: the chunk should be invalid to
2793  // indicate an unchunked schedule (which is the default)
2794  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2795  } else {
2796  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2797  __kmp_sch_map[kind - kmp_sched_lower - 1];
2798  }
2799  } else {
2800  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2801  // kmp_sched_lower - 2 ];
2802  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2803  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2804  kmp_sched_lower - 2];
2805  }
2806  __kmp_sched_apply_mods_intkind(
2807  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2808  if (kind == kmp_sched_auto || chunk < 1) {
2809  // ignore parameter chunk for schedule auto
2810  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2811  } else {
2812  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2813  }
2814 }
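// A user-level sketch (illustrative only; assumes the standard OpenMP API) of
// the kind/chunk mapping above; the result becomes the run-time schedule used
// by schedule(runtime) loops.
//
//   omp_set_schedule(omp_sched_static, 0);  // invalid chunk: plain (unchunked)
//                                           //   kmp_sch_static
//   omp_set_schedule(omp_sched_static, 16); // static, chunk 16
//   omp_set_schedule(omp_sched_auto, 64);   // chunk ignored for auto
//   #pragma omp parallel for schedule(runtime)
//   for (int i = 0; i < n; ++i) { /* ... */ }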
2815 
2816 /* Gets def_sched_var ICV values */
2817 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2818  kmp_info_t *thread;
2819  enum sched_type th_type;
2820 
2821  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2822  KMP_DEBUG_ASSERT(__kmp_init_serial);
2823 
2824  thread = __kmp_threads[gtid];
2825 
2826  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2827  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2828  case kmp_sch_static:
2829  case kmp_sch_static_greedy:
2830  case kmp_sch_static_balanced:
2831  *kind = kmp_sched_static;
2832  __kmp_sched_apply_mods_stdkind(kind, th_type);
2833  *chunk = 0; // chunk was not set, try to show this fact via zero value
2834  return;
2835  case kmp_sch_static_chunked:
2836  *kind = kmp_sched_static;
2837  break;
2838  case kmp_sch_dynamic_chunked:
2839  *kind = kmp_sched_dynamic;
2840  break;
2841  case kmp_sch_guided_chunked:
2842  case kmp_sch_guided_iterative_chunked:
2843  case kmp_sch_guided_analytical_chunked:
2844  *kind = kmp_sched_guided;
2845  break;
2846  case kmp_sch_auto:
2847  *kind = kmp_sched_auto;
2848  break;
2849  case kmp_sch_trapezoidal:
2850  *kind = kmp_sched_trapezoidal;
2851  break;
2852 #if KMP_STATIC_STEAL_ENABLED
2853  case kmp_sch_static_steal:
2854  *kind = kmp_sched_static_steal;
2855  break;
2856 #endif
2857  default:
2858  KMP_FATAL(UnknownSchedulingType, th_type);
2859  }
2860 
2861  __kmp_sched_apply_mods_stdkind(kind, th_type);
2862  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2863 }
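// Companion user-level sketch (illustrative only): a returned chunk of 0
// signals an unchunked schedule, as noted in the static case above.
//
//   omp_sched_t kind;
//   int chunk;
//   omp_get_schedule(&kind, &chunk); // e.g. kind == omp_sched_static and
//                                    //   chunk == 0 for plain "static"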
2864 
2865 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2866 
2867  int ii, dd;
2868  kmp_team_t *team;
2869  kmp_info_t *thr;
2870 
2871  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2872  KMP_DEBUG_ASSERT(__kmp_init_serial);
2873 
2874  // validate level
2875  if (level == 0)
2876  return 0;
2877  if (level < 0)
2878  return -1;
2879  thr = __kmp_threads[gtid];
2880  team = thr->th.th_team;
2881  ii = team->t.t_level;
2882  if (level > ii)
2883  return -1;
2884 
2885  if (thr->th.th_teams_microtask) {
2886  // AC: we are in teams region where multiple nested teams have same level
2887  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2888  if (level <=
2889  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2890  KMP_DEBUG_ASSERT(ii >= tlevel);
2891  // AC: As we need to pass by the teams league, we need to artificially
2892  // increase ii
2893  if (ii == tlevel) {
2894  ii += 2; // three teams have same level
2895  } else {
2896  ii++; // two teams have same level
2897  }
2898  }
2899  }
2900 
2901  if (ii == level)
2902  return __kmp_tid_from_gtid(gtid);
2903 
2904  dd = team->t.t_serialized;
2905  level++;
2906  while (ii > level) {
2907  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2908  }
2909  if ((team->t.t_serialized) && (!dd)) {
2910  team = team->t.t_parent;
2911  continue;
2912  }
2913  if (ii > level) {
2914  team = team->t.t_parent;
2915  dd = team->t.t_serialized;
2916  ii--;
2917  }
2918  }
2919 
2920  return (dd > 1) ? (0) : (team->t.t_master_tid);
2921 }
2922 
2923 int __kmp_get_team_size(int gtid, int level) {
2924 
2925  int ii, dd;
2926  kmp_team_t *team;
2927  kmp_info_t *thr;
2928 
2929  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2930  KMP_DEBUG_ASSERT(__kmp_init_serial);
2931 
2932  // validate level
2933  if (level == 0)
2934  return 1;
2935  if (level < 0)
2936  return -1;
2937  thr = __kmp_threads[gtid];
2938  team = thr->th.th_team;
2939  ii = team->t.t_level;
2940  if (level > ii)
2941  return -1;
2942 
2943  if (thr->th.th_teams_microtask) {
2944  // AC: we are in teams region where multiple nested teams have same level
2945  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2946  if (level <=
2947  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2948  KMP_DEBUG_ASSERT(ii >= tlevel);
2949  // AC: As we need to pass by the teams league, we need to artificially
2950  // increase ii
2951  if (ii == tlevel) {
2952  ii += 2; // three teams have same level
2953  } else {
2954  ii++; // two teams have same level
2955  }
2956  }
2957  }
2958 
2959  while (ii > level) {
2960  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2961  }
2962  if (team->t.t_serialized && (!dd)) {
2963  team = team->t.t_parent;
2964  continue;
2965  }
2966  if (ii > level) {
2967  team = team->t.t_parent;
2968  ii--;
2969  }
2970  }
2971 
2972  return team->t.t_nproc;
2973 }
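// A user-level sketch (illustrative only; assumes a standard OpenMP compiler
// with nesting allowed) for the two level queries above:
//
//   omp_set_max_active_levels(2);
//   #pragma omp parallel num_threads(4)   // level 1
//   #pragma omp parallel num_threads(2)   // level 2
//   {
//     omp_get_ancestor_thread_num(1); // tid of this thread's level-1 ancestor
//     omp_get_team_size(0);           // 1  (initial implicit team)
//     omp_get_team_size(1);           // 4
//     omp_get_team_size(2);           // 2  (current team)
//     omp_get_team_size(3);           // -1 (no such level)
//   }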
2974 
2975 kmp_r_sched_t __kmp_get_schedule_global() {
2976  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2977  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2978  // independently, so the updated schedule can be obtained here.
2979 
2980  kmp_r_sched_t r_sched;
2981 
2982  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2983  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2984  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2985  // different roots (even in OMP 2.5)
2986  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2987  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2988  if (s == kmp_sch_static) {
2989  // replace STATIC with more detailed schedule (balanced or greedy)
2990  r_sched.r_sched_type = __kmp_static;
2991  } else if (s == kmp_sch_guided_chunked) {
2992  // replace GUIDED with more detailed schedule (iterative or analytical)
2993  r_sched.r_sched_type = __kmp_guided;
2994  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2995  r_sched.r_sched_type = __kmp_sched;
2996  }
2997  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2998 
2999  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3000  // __kmp_chunk may be wrong here (if it was not ever set)
3001  r_sched.chunk = KMP_DEFAULT_CHUNK;
3002  } else {
3003  r_sched.chunk = __kmp_chunk;
3004  }
3005 
3006  return r_sched;
3007 }
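// Illustrative only (assumes standard OpenMP environment-variable handling):
// the globals combined above typically reflect settings such as
//
//   OMP_SCHEDULE="guided,4" ./app   // r_sched_type from __kmp_guided, chunk 4
//   OMP_SCHEDULE="static"   ./app   // r_sched_type from __kmp_static,
//                                   //   chunk = KMP_DEFAULT_CHUNK
//
// while KMP_SCHEDULE (a runtime extension) selects the detailed static/guided
// flavors stored in __kmp_static and __kmp_guided.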
3008 
3009  /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3010  at least argc *t_argv entries for the requested team. */
3011 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3012 
3013  KMP_DEBUG_ASSERT(team);
3014  if (!realloc || argc > team->t.t_max_argc) {
3015 
3016  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3017  "current entries=%d\n",
3018  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3019  /* if previously allocated heap space for args, free them */
3020  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3021  __kmp_free((void *)team->t.t_argv);
3022 
3023  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3024  /* use unused space in the cache line for arguments */
3025  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3026  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3027  "argv entries\n",
3028  team->t.t_id, team->t.t_max_argc));
3029  team->t.t_argv = &team->t.t_inline_argv[0];
3030  if (__kmp_storage_map) {
3031  __kmp_print_storage_map_gtid(
3032  -1, &team->t.t_inline_argv[0],
3033  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3034  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3035  team->t.t_id);
3036  }
3037  } else {
3038  /* allocate space for arguments in the heap */
3039  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3040  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3041  : 2 * argc;
3042  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3043  "argv entries\n",
3044  team->t.t_id, team->t.t_max_argc));
3045  team->t.t_argv =
3046  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3047  if (__kmp_storage_map) {
3048  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3049  &team->t.t_argv[team->t.t_max_argc],
3050  sizeof(void *) * team->t.t_max_argc,
3051  "team_%d.t_argv", team->t.t_id);
3052  }
3053  }
3054  }
3055 }
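// The sizing policy above, restated as a sketch (illustrative only):
//
//   capacity = (argc <= KMP_INLINE_ARGV_ENTRIES)
//                  ? KMP_INLINE_ARGV_ENTRIES           // reuse cache-line space
//                  : (argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2)
//                        ? KMP_MIN_MALLOC_ARGV_ENTRIES // small heap block
//                        : 2 * argc;                   // grow with headroom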
3056 
3057 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3058  int i;
3059  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3060  team->t.t_threads =
3061  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3062  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3063  sizeof(dispatch_shared_info_t) * num_disp_buff);
3064  team->t.t_dispatch =
3065  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3066  team->t.t_implicit_task_taskdata =
3067  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3068  team->t.t_max_nproc = max_nth;
3069 
3070  /* setup dispatch buffers */
3071  for (i = 0; i < num_disp_buff; ++i) {
3072  team->t.t_disp_buffer[i].buffer_index = i;
3073  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3074  }
3075 }
3076 
3077 static void __kmp_free_team_arrays(kmp_team_t *team) {
3078  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3079  int i;
3080  for (i = 0; i < team->t.t_max_nproc; ++i) {
3081  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3082  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3083  team->t.t_dispatch[i].th_disp_buffer = NULL;
3084  }
3085  }
3086 #if KMP_USE_HIER_SCHED
3087  __kmp_dispatch_free_hierarchies(team);
3088 #endif
3089  __kmp_free(team->t.t_threads);
3090  __kmp_free(team->t.t_disp_buffer);
3091  __kmp_free(team->t.t_dispatch);
3092  __kmp_free(team->t.t_implicit_task_taskdata);
3093  team->t.t_threads = NULL;
3094  team->t.t_disp_buffer = NULL;
3095  team->t.t_dispatch = NULL;
3096  team->t.t_implicit_task_taskdata = 0;
3097 }
3098 
3099 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3100  kmp_info_t **oldThreads = team->t.t_threads;
3101 
3102  __kmp_free(team->t.t_disp_buffer);
3103  __kmp_free(team->t.t_dispatch);
3104  __kmp_free(team->t.t_implicit_task_taskdata);
3105  __kmp_allocate_team_arrays(team, max_nth);
3106 
3107  KMP_MEMCPY(team->t.t_threads, oldThreads,
3108  team->t.t_nproc * sizeof(kmp_info_t *));
3109 
3110  __kmp_free(oldThreads);
3111 }
3112 
3113 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3114 
3115  kmp_r_sched_t r_sched =
3116  __kmp_get_schedule_global(); // get current state of scheduling globals
3117 
3118  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3119 
3120  kmp_internal_control_t g_icvs = {
3121  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3122  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3123  // adjustment of threads (per thread)
3124  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3125  // whether blocktime is explicitly set
3126  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3127 #if KMP_USE_MONITOR
3128  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3129 // intervals
3130 #endif
3131  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3132  // next parallel region (per thread)
3133  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3134  __kmp_cg_max_nth, // int thread_limit;
3135  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3136  // for max_active_levels
3137  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3138  // {sched,chunk} pair
3139  __kmp_nested_proc_bind.bind_types[0],
3140  __kmp_default_device,
3141  NULL // struct kmp_internal_control *next;
3142  };
3143 
3144  return g_icvs;
3145 }
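
// Several of the globals gathered above back ICVs that are visible through the
// standard OpenMP API. A sketch of querying them from user code; illustrative
// only, assumes any conforming OpenMP 4.5+ runtime.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  printf("dyn-var               : %d\n", omp_get_dynamic());
  printf("nthreads-var          : %d\n", omp_get_max_threads());
  printf("thread-limit-var      : %d\n", omp_get_thread_limit());
  printf("max-active-levels-var : %d\n", omp_get_max_active_levels());
  printf("bind-var (level 0)    : %d\n", (int)omp_get_proc_bind());
  printf("default-device-var    : %d\n", omp_get_default_device());
  return 0;
}
#endif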
3146 
3147 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3148 
3149  kmp_internal_control_t gx_icvs;
3150  gx_icvs.serial_nesting_level =
3151  0; // probably =team->t.t_serial like in save_inter_controls
3152  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3153  gx_icvs.next = NULL;
3154 
3155  return gx_icvs;
3156 }
3157 
3158 static void __kmp_initialize_root(kmp_root_t *root) {
3159  int f;
3160  kmp_team_t *root_team;
3161  kmp_team_t *hot_team;
3162  int hot_team_max_nth;
3163  kmp_r_sched_t r_sched =
3164  __kmp_get_schedule_global(); // get current state of scheduling globals
3165  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3166  KMP_DEBUG_ASSERT(root);
3167  KMP_ASSERT(!root->r.r_begin);
3168 
3169  /* setup the root state structure */
3170  __kmp_init_lock(&root->r.r_begin_lock);
3171  root->r.r_begin = FALSE;
3172  root->r.r_active = FALSE;
3173  root->r.r_in_parallel = 0;
3174  root->r.r_blocktime = __kmp_dflt_blocktime;
3175 
3176  /* setup the root team for this task */
3177  /* allocate the root team structure */
3178  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3179 
3180  root_team =
3181  __kmp_allocate_team(root,
3182  1, // new_nproc
3183  1, // max_nproc
3184 #if OMPT_SUPPORT
3185  ompt_data_none, // root parallel id
3186 #endif
3187  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3188  0 // argc
3189  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3190  );
3191 #if USE_DEBUGGER
3192  // Non-NULL value should be assigned to make the debugger display the root
3193  // team.
3194  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3195 #endif
3196 
3197  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3198 
3199  root->r.r_root_team = root_team;
3200  root_team->t.t_control_stack_top = NULL;
3201 
3202  /* initialize root team */
3203  root_team->t.t_threads[0] = NULL;
3204  root_team->t.t_nproc = 1;
3205  root_team->t.t_serialized = 1;
3206  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3207  root_team->t.t_sched.sched = r_sched.sched;
3208  KA_TRACE(
3209  20,
3210  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3211  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3212 
3213  /* setup the hot team for this task */
3214  /* allocate the hot team structure */
3215  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3216 
3217  hot_team =
3218  __kmp_allocate_team(root,
3219  1, // new_nproc
3220  __kmp_dflt_team_nth_ub * 2, // max_nproc
3221 #if OMPT_SUPPORT
3222  ompt_data_none, // root parallel id
3223 #endif
3224  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3225  0 // argc
3226  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3227  );
3228  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3229 
3230  root->r.r_hot_team = hot_team;
3231  root_team->t.t_control_stack_top = NULL;
3232 
3233  /* first-time initialization */
3234  hot_team->t.t_parent = root_team;
3235 
3236  /* initialize hot team */
3237  hot_team_max_nth = hot_team->t.t_max_nproc;
3238  for (f = 0; f < hot_team_max_nth; ++f) {
3239  hot_team->t.t_threads[f] = NULL;
3240  }
3241  hot_team->t.t_nproc = 1;
3242  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3243  hot_team->t.t_sched.sched = r_sched.sched;
3244  hot_team->t.t_size_changed = 0;
3245 }
3246 
3247 #ifdef KMP_DEBUG
3248 
3249 typedef struct kmp_team_list_item {
3250  kmp_team_p const *entry;
3251  struct kmp_team_list_item *next;
3252 } kmp_team_list_item_t;
3253 typedef kmp_team_list_item_t *kmp_team_list_t;
3254 
3255 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3256  kmp_team_list_t list, // List of teams.
3257  kmp_team_p const *team // Team to add.
3258  ) {
3259 
3260  // List must terminate with item where both entry and next are NULL.
3261  // Team is added to the list only once.
3262  // List is sorted in ascending order by team id.
3263  // Team id is *not* a key.
3264 
3265  kmp_team_list_t l;
3266 
3267  KMP_DEBUG_ASSERT(list != NULL);
3268  if (team == NULL) {
3269  return;
3270  }
3271 
3272  __kmp_print_structure_team_accum(list, team->t.t_parent);
3273  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3274 
3275  // Search list for the team.
3276  l = list;
3277  while (l->next != NULL && l->entry != team) {
3278  l = l->next;
3279  }
3280  if (l->next != NULL) {
3281  return; // Team has been added before, exit.
3282  }
3283 
3284  // Team is not found. Search list again for insertion point.
3285  l = list;
3286  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3287  l = l->next;
3288  }
3289 
3290  // Insert team.
3291  {
3292  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3293  sizeof(kmp_team_list_item_t));
3294  *item = *l;
3295  l->entry = team;
3296  l->next = item;
3297  }
3298 }
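
// Standalone sketch of the insertion idiom used above: the list always ends in
// a sentinel item whose entry and next are both NULL, and insertion copies the
// current item into a freshly allocated one, so no "previous" pointer is ever
// needed. Types and names here are stand-ins; illustrative only.
#if 0
#include <stdio.h>
#include <stdlib.h>

typedef struct node { int key; struct node *next; } node_t;

static void insert_sorted(node_t *list, int key) {
  node_t *l = list;
  while (l->next != NULL && l->key <= key)
    l = l->next;               /* stop at the insertion point or the sentinel */
  node_t *item = (node_t *)malloc(sizeof(node_t));
  *item = *l;                  /* old payload (possibly the sentinel) moves on */
  l->key = key;                /* the current slot now holds the new key */
  l->next = item;
}

int main(void) {
  node_t *list = (node_t *)calloc(1, sizeof(node_t)); /* {0, NULL} sentinel */
  int keys[] = {5, 1, 3};
  for (int i = 0; i < 3; ++i)
    insert_sorted(list, keys[i]);
  for (node_t *l = list; l->next != NULL; l = l->next)
    printf("%d ", l->key);     /* prints: 1 3 5 */
  printf("\n");
  while (list) { node_t *tmp = list; list = list->next; free(tmp); }
  return 0;
}
#endif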
3299 
3300 static void __kmp_print_structure_team(char const *title,
3301                                        kmp_team_p const *team) {
3303  __kmp_printf("%s", title);
3304  if (team != NULL) {
3305  __kmp_printf("%2x %p\n", team->t.t_id, team);
3306  } else {
3307  __kmp_printf(" - (nil)\n");
3308  }
3309 }
3310 
3311 static void __kmp_print_structure_thread(char const *title,
3312  kmp_info_p const *thread) {
3313  __kmp_printf("%s", title);
3314  if (thread != NULL) {
3315  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3316  } else {
3317  __kmp_printf(" - (nil)\n");
3318  }
3319 }
3320 
3321 void __kmp_print_structure(void) {
3322 
3323  kmp_team_list_t list;
3324 
3325  // Initialize list of teams.
3326  list =
3327  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3328  list->entry = NULL;
3329  list->next = NULL;
3330 
3331  __kmp_printf("\n------------------------------\nGlobal Thread "
3332  "Table\n------------------------------\n");
3333  {
3334  int gtid;
3335  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3336  __kmp_printf("%2d", gtid);
3337  if (__kmp_threads != NULL) {
3338  __kmp_printf(" %p", __kmp_threads[gtid]);
3339  }
3340  if (__kmp_root != NULL) {
3341  __kmp_printf(" %p", __kmp_root[gtid]);
3342  }
3343  __kmp_printf("\n");
3344  }
3345  }
3346 
3347  // Print out __kmp_threads array.
3348  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3349  "----------\n");
3350  if (__kmp_threads != NULL) {
3351  int gtid;
3352  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3353  kmp_info_t const *thread = __kmp_threads[gtid];
3354  if (thread != NULL) {
3355  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3356  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3357  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3358  __kmp_print_structure_team(" Serial Team: ",
3359  thread->th.th_serial_team);
3360  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3361  __kmp_print_structure_thread(" Master: ",
3362  thread->th.th_team_master);
3363  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3364  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3365  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3366  __kmp_print_structure_thread(" Next in pool: ",
3367  thread->th.th_next_pool);
3368  __kmp_printf("\n");
3369  __kmp_print_structure_team_accum(list, thread->th.th_team);
3370  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3371  }
3372  }
3373  } else {
3374  __kmp_printf("Threads array is not allocated.\n");
3375  }
3376 
3377  // Print out __kmp_root array.
3378  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3379  "--------\n");
3380  if (__kmp_root != NULL) {
3381  int gtid;
3382  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3383  kmp_root_t const *root = __kmp_root[gtid];
3384  if (root != NULL) {
3385  __kmp_printf("GTID %2d %p:\n", gtid, root);
3386  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3387  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3388  __kmp_print_structure_thread(" Uber Thread: ",
3389  root->r.r_uber_thread);
3390  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3391  __kmp_printf(" In Parallel: %2d\n",
3392  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3393  __kmp_printf("\n");
3394  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3395  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3396  }
3397  }
3398  } else {
3399  __kmp_printf("Ubers array is not allocated.\n");
3400  }
3401 
3402  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3403  "--------\n");
3404  while (list->next != NULL) {
3405  kmp_team_p const *team = list->entry;
3406  int i;
3407  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3408  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3409  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3410  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3411  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3412  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3413  for (i = 0; i < team->t.t_nproc; ++i) {
3414  __kmp_printf(" Thread %2d: ", i);
3415  __kmp_print_structure_thread("", team->t.t_threads[i]);
3416  }
3417  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3418  __kmp_printf("\n");
3419  list = list->next;
3420  }
3421 
3422  // Print out __kmp_thread_pool and __kmp_team_pool.
3423  __kmp_printf("\n------------------------------\nPools\n----------------------"
3424  "--------\n");
3425  __kmp_print_structure_thread("Thread pool: ",
3426  CCAST(kmp_info_t *, __kmp_thread_pool));
3427  __kmp_print_structure_team("Team pool: ",
3428  CCAST(kmp_team_t *, __kmp_team_pool));
3429  __kmp_printf("\n");
3430 
3431  // Free team list.
3432  while (list != NULL) {
3433  kmp_team_list_item_t *item = list;
3434  list = list->next;
3435  KMP_INTERNAL_FREE(item);
3436  }
3437 }
3438 
3439 #endif
3440 
3441 //---------------------------------------------------------------------------
3442 // Stuff for per-thread fast random number generator
3443 // Table of primes
3444 static const unsigned __kmp_primes[] = {
3445  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3446  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3447  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3448  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3449  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3450  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3451  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3452  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3453  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3454  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3455  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3456 
3457 //---------------------------------------------------------------------------
3458 // __kmp_get_random: Get a random number using a linear congruential method.
3459 unsigned short __kmp_get_random(kmp_info_t *thread) {
3460  unsigned x = thread->th.th_x;
3461  unsigned short r = (unsigned short)(x >> 16);
3462 
3463  thread->th.th_x = x * thread->th.th_a + 1;
3464 
3465  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3466  thread->th.th_info.ds.ds_tid, r));
3467 
3468  return r;
3469 }
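
// Minimal standalone version of the same generator: a 32-bit linear
// congruential step x = a*x + 1 (mod 2^32), returning the upper 16 bits, which
// are better distributed than the low-order ones. The seeding mirrors
// __kmp_init_random() below; illustrative only.
#if 0
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t x, a; } lcg_t;

static void lcg_seed(lcg_t *g, uint32_t seed, uint32_t multiplier) {
  g->a = multiplier;            /* the runtime picks this from __kmp_primes */
  g->x = (seed + 1) * g->a + 1;
}

static uint16_t lcg_next(lcg_t *g) {
  uint16_t r = (uint16_t)(g->x >> 16);
  g->x = g->x * g->a + 1;       /* unsigned overflow gives the mod-2^32 wrap */
  return r;
}

int main(void) {
  lcg_t g;
  lcg_seed(&g, 0, 0x9e3779b1u); /* first entry of the primes table above */
  for (int i = 0; i < 4; ++i)
    printf("%u\n", (unsigned)lcg_next(&g));
  return 0;
}
#endif
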
3470 //--------------------------------------------------------
3471 // __kmp_init_random: Initialize a random number generator
3472 void __kmp_init_random(kmp_info_t *thread) {
3473  unsigned seed = thread->th.th_info.ds.ds_tid;
3474 
3475  thread->th.th_a =
3476  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3477  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3478  KA_TRACE(30,
3479  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3480 }
3481 
3482 #if KMP_OS_WINDOWS
3483 /* reclaim array entries for root threads that are already dead, returns number
3484  * reclaimed */
3485 static int __kmp_reclaim_dead_roots(void) {
3486  int i, r = 0;
3487 
3488  for (i = 0; i < __kmp_threads_capacity; ++i) {
3489  if (KMP_UBER_GTID(i) &&
3490  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3491  !__kmp_root[i]
3492  ->r.r_active) { // AC: reclaim only roots died in non-active state
3493  r += __kmp_unregister_root_other_thread(i);
3494  }
3495  }
3496  return r;
3497 }
3498 #endif
3499 
3500 /* This function attempts to create free entries in __kmp_threads and
3501  __kmp_root, and returns the number of free entries generated.
3502 
3503  For Windows* OS static library, the first mechanism used is to reclaim array
3504  entries for root threads that are already dead.
3505 
3506  On all platforms, expansion is attempted on the arrays __kmp_threads and
3507  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3508  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3509  threadprivate cache array has been created. Synchronization with
3510  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3511 
3512  After any dead root reclamation, if the clipping value allows array expansion
3513  to result in the generation of a total of nNeed free slots, the function does
3514  that expansion. If not, nothing is done beyond the possible initial root
3515  thread reclamation.
3516 
3517  If any argument is negative, the behavior is undefined. */
3518 static int __kmp_expand_threads(int nNeed) {
3519  int added = 0;
3520  int minimumRequiredCapacity;
3521  int newCapacity;
3522  kmp_info_t **newThreads;
3523  kmp_root_t **newRoot;
3524 
3525 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3526 // resizing __kmp_threads does not need additional protection if foreign
3527 // threads are present
3528 
3529 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3530  /* only for Windows static library */
3531  /* reclaim array entries for root threads that are already dead */
3532  added = __kmp_reclaim_dead_roots();
3533 
3534  if (nNeed) {
3535  nNeed -= added;
3536  if (nNeed < 0)
3537  nNeed = 0;
3538  }
3539 #endif
3540  if (nNeed <= 0)
3541  return added;
3542 
3543  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3544  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3545  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3546  // > __kmp_max_nth in one of two ways:
3547  //
3548  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3549  // may not be reused by another thread, so we may need to increase
3550  // __kmp_threads_capacity to __kmp_max_nth + 1.
3551  //
3552  // 2) New foreign root(s) are encountered. We always register new foreign
3553  // roots. This may cause a smaller # of threads to be allocated at
3554  // subsequent parallel regions, but the worker threads hang around (and
3555  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3556  //
3557  // Anyway, that is the reason for moving the check to see if
3558  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3559  // instead of having it performed here. -BB
3560 
3561  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3562 
3563  /* compute expansion headroom to check if we can expand */
3564  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3565  /* possible expansion too small -- give up */
3566  return added;
3567  }
3568  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3569 
3570  newCapacity = __kmp_threads_capacity;
3571  do {
3572  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3573  : __kmp_sys_max_nth;
3574  } while (newCapacity < minimumRequiredCapacity);
3575  newThreads = (kmp_info_t **)__kmp_allocate(
3576  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3577  newRoot =
3578  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3579  KMP_MEMCPY(newThreads, __kmp_threads,
3580  __kmp_threads_capacity * sizeof(kmp_info_t *));
3581  KMP_MEMCPY(newRoot, __kmp_root,
3582  __kmp_threads_capacity * sizeof(kmp_root_t *));
3583 
3584  kmp_info_t **temp_threads = __kmp_threads;
3585  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3586  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3587  __kmp_free(temp_threads);
3588  added += newCapacity - __kmp_threads_capacity;
3589  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3590 
3591  if (newCapacity > __kmp_tp_capacity) {
3592  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3593  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3594  __kmp_threadprivate_resize_cache(newCapacity);
3595  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3596  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3597  }
3598  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3599  }
3600 
3601  return added;
3602 }
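
// The capacity computation above in isolation: double until the request is
// covered, clipping at a system maximum so the doubling cannot overshoot it.
// As in the runtime, the caller is expected to have verified beforehand that
// the request fits under the maximum. Illustrative sketch only.
#if 0
#include <stdio.h>

static int grow_capacity(int current, int needed, int sys_max) {
  int required = current + needed; /* must not exceed sys_max (caller checks) */
  int cap = current;
  do {
    cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max;
  } while (cap < required);
  return cap;
}

int main(void) {
  printf("%d\n", grow_capacity(32, 1, 1024));    /* 64             */
  printf("%d\n", grow_capacity(32, 100, 1024));  /* 256            */
  printf("%d\n", grow_capacity(700, 200, 1024)); /* 1024 (clipped) */
  return 0;
}
#endif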
3603 
3604 /* Register the current thread as a root thread and obtain our gtid. We must
3605  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3606  the thread that calls from __kmp_do_serial_initialize() */
3607 int __kmp_register_root(int initial_thread) {
3608  kmp_info_t *root_thread;
3609  kmp_root_t *root;
3610  int gtid;
3611  int capacity;
3612  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3613  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3614  KMP_MB();
3615 
3616  /* 2007-03-02:
3617  If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3618  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3619  does not work as expected -- it may return false (meaning there is at least
3620  one empty slot in the __kmp_threads array), but it is possible that the only
3621  free slot is #0, which is reserved for the initial thread and so cannot be
3622  used for this one. The following code works around this bug.
3623 
3624  However, the right solution seems to be not reserving slot #0 for the
3625  initial thread, because:
3626  (1) there is no magic in slot #0,
3627  (2) we cannot detect the initial thread reliably (the first thread that does
3628  serial initialization may not be a real initial thread).
3629  */
3630  capacity = __kmp_threads_capacity;
3631  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3632  --capacity;
3633  }
3634 
3635  /* see if there are too many threads */
3636  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3637  if (__kmp_tp_cached) {
3638  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3639  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3640  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3641  } else {
3642  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3643  __kmp_msg_null);
3644  }
3645  }
3646 
3647  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3648  // 0: initial thread, also a regular OpenMP thread.
3649  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3650  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3651  // regular OpenMP threads.
3652  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3653  // Find an available thread slot for hidden helper thread. Slots for hidden
3654  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3655  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3656  gtid <= __kmp_hidden_helper_threads_num;
3657  gtid++)
3658  ;
3659  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3660  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3661  "hidden helper thread: T#%d\n",
3662  gtid));
3663  } else {
3664  /* find an available thread slot */
3665  // Don't reassign the zero slot since we need that to only be used by
3666  // initial thread. Slots for hidden helper threads should also be skipped.
3667  if (initial_thread && __kmp_threads[0] == NULL) {
3668  gtid = 0;
3669  } else {
3670  for (gtid = __kmp_hidden_helper_threads_num + 1;
3671  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3672  ;
3673  }
3674  KA_TRACE(
3675  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3676  KMP_ASSERT(gtid < __kmp_threads_capacity);
3677  }
3678 
3679  /* update global accounting */
3680  __kmp_all_nth++;
3681  TCW_4(__kmp_nth, __kmp_nth + 1);
3682 
3683  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3684  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3685  if (__kmp_adjust_gtid_mode) {
3686  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3687  if (TCR_4(__kmp_gtid_mode) != 2) {
3688  TCW_4(__kmp_gtid_mode, 2);
3689  }
3690  } else {
3691  if (TCR_4(__kmp_gtid_mode) != 1) {
3692  TCW_4(__kmp_gtid_mode, 1);
3693  }
3694  }
3695  }
3696 
3697 #ifdef KMP_ADJUST_BLOCKTIME
3698  /* Adjust blocktime to zero if necessary */
3699  /* Middle initialization might not have occurred yet */
3700  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3701  if (__kmp_nth > __kmp_avail_proc) {
3702  __kmp_zero_bt = TRUE;
3703  }
3704  }
3705 #endif /* KMP_ADJUST_BLOCKTIME */
3706 
3707  /* setup this new hierarchy */
3708  if (!(root = __kmp_root[gtid])) {
3709  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3710  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3711  }
3712 
3713 #if KMP_STATS_ENABLED
3714  // Initialize stats as soon as possible (right after gtid assignment).
3715  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3716  __kmp_stats_thread_ptr->startLife();
3717  KMP_SET_THREAD_STATE(SERIAL_REGION);
3718  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3719 #endif
3720  __kmp_initialize_root(root);
3721 
3722  /* setup new root thread structure */
3723  if (root->r.r_uber_thread) {
3724  root_thread = root->r.r_uber_thread;
3725  } else {
3726  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3727  if (__kmp_storage_map) {
3728  __kmp_print_thread_storage_map(root_thread, gtid);
3729  }
3730  root_thread->th.th_info.ds.ds_gtid = gtid;
3731 #if OMPT_SUPPORT
3732  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3733 #endif
3734  root_thread->th.th_root = root;
3735  if (__kmp_env_consistency_check) {
3736  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3737  }
3738 #if USE_FAST_MEMORY
3739  __kmp_initialize_fast_memory(root_thread);
3740 #endif /* USE_FAST_MEMORY */
3741 
3742 #if KMP_USE_BGET
3743  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3744  __kmp_initialize_bget(root_thread);
3745 #endif
3746  __kmp_init_random(root_thread); // Initialize random number generator
3747  }
3748 
3749  /* setup the serial team held in reserve by the root thread */
3750  if (!root_thread->th.th_serial_team) {
3751  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3752  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3753  root_thread->th.th_serial_team = __kmp_allocate_team(
3754  root, 1, 1,
3755 #if OMPT_SUPPORT
3756  ompt_data_none, // root parallel id
3757 #endif
3758  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3759  }
3760  KMP_ASSERT(root_thread->th.th_serial_team);
3761  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3762  root_thread->th.th_serial_team));
3763 
3764  /* drop root_thread into place */
3765  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3766 
3767  root->r.r_root_team->t.t_threads[0] = root_thread;
3768  root->r.r_hot_team->t.t_threads[0] = root_thread;
3769  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3770  // AC: the team is created in reserve, not for execution (unused for now).
3771  root_thread->th.th_serial_team->t.t_serialized = 0;
3772  root->r.r_uber_thread = root_thread;
3773 
3774  /* initialize the thread, get it ready to go */
3775  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3776  TCW_4(__kmp_init_gtid, TRUE);
3777 
3778  /* prepare the master thread for get_gtid() */
3779  __kmp_gtid_set_specific(gtid);
3780 
3781 #if USE_ITT_BUILD
3782  __kmp_itt_thread_name(gtid);
3783 #endif /* USE_ITT_BUILD */
3784 
3785 #ifdef KMP_TDATA_GTID
3786  __kmp_gtid = gtid;
3787 #endif
3788  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3789  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3790 
3791  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3792  "plain=%u\n",
3793  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3794  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3795  KMP_INIT_BARRIER_STATE));
3796  { // Initialize barrier data.
3797  int b;
3798  for (b = 0; b < bs_last_barrier; ++b) {
3799  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3800 #if USE_DEBUGGER
3801  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3802 #endif
3803  }
3804  }
3805  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3806  KMP_INIT_BARRIER_STATE);
3807 
3808 #if KMP_AFFINITY_SUPPORTED
3809  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3810  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3811  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3812  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3813  if (TCR_4(__kmp_init_middle)) {
3814  __kmp_affinity_set_init_mask(gtid, TRUE);
3815  }
3816 #endif /* KMP_AFFINITY_SUPPORTED */
3817  root_thread->th.th_def_allocator = __kmp_def_allocator;
3818  root_thread->th.th_prev_level = 0;
3819  root_thread->th.th_prev_num_threads = 1;
3820 
3821  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3822  tmp->cg_root = root_thread;
3823  tmp->cg_thread_limit = __kmp_cg_max_nth;
3824  tmp->cg_nthreads = 1;
3825  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3826  " cg_nthreads init to 1\n",
3827  root_thread, tmp));
3828  tmp->up = NULL;
3829  root_thread->th.th_cg_roots = tmp;
3830 
3831  __kmp_root_counter++;
3832 
3833 #if OMPT_SUPPORT
3834  if (!initial_thread && ompt_enabled.enabled) {
3835 
3836  kmp_info_t *root_thread = ompt_get_thread();
3837 
3838  ompt_set_thread_state(root_thread, ompt_state_overhead);
3839 
3840  if (ompt_enabled.ompt_callback_thread_begin) {
3841  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3842  ompt_thread_initial, __ompt_get_thread_data_internal());
3843  }
3844  ompt_data_t *task_data;
3845  ompt_data_t *parallel_data;
3846  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3847  if (ompt_enabled.ompt_callback_implicit_task) {
3848  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3849  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3850  }
3851 
3852  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3853  }
3854 #endif
3855 
3856  KMP_MB();
3857  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3858 
3859  return gtid;
3860 }
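
// User-level sketch of how a foreign (non-OpenMP) thread ends up in
// __kmp_register_root(): the first OpenMP construct executed by a native
// thread registers it as a new root with its own root team and hot team.
// Illustrative only; assumes a POSIX threads environment.
#if 0
#include <omp.h>
#include <pthread.h>
#include <stdio.h>

static void *foreign_thread(void *arg) {
  long id = (long)arg;
  // First OpenMP construct on this native thread: the runtime registers it as
  // a root before forking the team below.
#pragma omp parallel num_threads(2)
  printf("foreign %ld: OpenMP thread %d of %d\n", id, omp_get_thread_num(),
         omp_get_num_threads());
  return NULL;
}

int main(void) {
  pthread_t t[2];
  for (long i = 0; i < 2; ++i)
    pthread_create(&t[i], NULL, foreign_thread, (void *)i);
  for (int i = 0; i < 2; ++i)
    pthread_join(t[i], NULL);
  return 0;
}
#endif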
3861 
3862 #if KMP_NESTED_HOT_TEAMS
3863 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3864  const int max_level) {
3865  int i, n, nth;
3866  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3867  if (!hot_teams || !hot_teams[level].hot_team) {
3868  return 0;
3869  }
3870  KMP_DEBUG_ASSERT(level < max_level);
3871  kmp_team_t *team = hot_teams[level].hot_team;
3872  nth = hot_teams[level].hot_team_nth;
3873  n = nth - 1; // master is not freed
3874  if (level < max_level - 1) {
3875  for (i = 0; i < nth; ++i) {
3876  kmp_info_t *th = team->t.t_threads[i];
3877  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3878  if (i > 0 && th->th.th_hot_teams) {
3879  __kmp_free(th->th.th_hot_teams);
3880  th->th.th_hot_teams = NULL;
3881  }
3882  }
3883  }
3884  __kmp_free_team(root, team, NULL);
3885  return n;
3886 }
3887 #endif
3888 
3889 // Resets a root thread and clears its root and hot teams.
3890 // Returns the number of __kmp_threads entries directly and indirectly freed.
3891 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3892  kmp_team_t *root_team = root->r.r_root_team;
3893  kmp_team_t *hot_team = root->r.r_hot_team;
3894  int n = hot_team->t.t_nproc;
3895  int i;
3896 
3897  KMP_DEBUG_ASSERT(!root->r.r_active);
3898 
3899  root->r.r_root_team = NULL;
3900  root->r.r_hot_team = NULL;
3901  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3902  // before call to __kmp_free_team().
3903  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3904 #if KMP_NESTED_HOT_TEAMS
3905  if (__kmp_hot_teams_max_level >
3906  0) { // need to free nested hot teams and their threads if any
3907  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3908  kmp_info_t *th = hot_team->t.t_threads[i];
3909  if (__kmp_hot_teams_max_level > 1) {
3910  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3911  }
3912  if (th->th.th_hot_teams) {
3913  __kmp_free(th->th.th_hot_teams);
3914  th->th.th_hot_teams = NULL;
3915  }
3916  }
3917  }
3918 #endif
3919  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3920 
3921  // Before we can reap the thread, we need to make certain that all other
3922  // threads in the teams that had this root as ancestor have stopped trying to
3923  // steal tasks.
3924  if (__kmp_tasking_mode != tskm_immediate_exec) {
3925  __kmp_wait_to_unref_task_teams();
3926  }
3927 
3928 #if KMP_OS_WINDOWS
3929  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3930  KA_TRACE(
3931  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3932  "\n",
3933  (LPVOID) & (root->r.r_uber_thread->th),
3934  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3935  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3936 #endif /* KMP_OS_WINDOWS */
3937 
3938 #if OMPT_SUPPORT
3939  ompt_data_t *task_data;
3940  ompt_data_t *parallel_data;
3941  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3942  if (ompt_enabled.ompt_callback_implicit_task) {
3943  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3944  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3945  }
3946  if (ompt_enabled.ompt_callback_thread_end) {
3947  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3948  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3949  }
3950 #endif
3951 
3952  TCW_4(__kmp_nth,
3953  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3954  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3955  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3956  " to %d\n",
3957  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3958  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3959  if (i == 1) {
3960  // need to free contention group structure
3961  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3962  root->r.r_uber_thread->th.th_cg_roots->cg_root);
3963  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3964  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3965  root->r.r_uber_thread->th.th_cg_roots = NULL;
3966  }
3967  __kmp_reap_thread(root->r.r_uber_thread, 1);
3968 
3969  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3970  // it instead of freeing it.
3971  root->r.r_uber_thread = NULL;
3972  /* mark root as no longer in use */
3973  root->r.r_begin = FALSE;
3974 
3975  return n;
3976 }
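
// Sketch of the counter idiom used for th_cg_roots above: the post-decrement
// returns the previous count, so a previous value of 1 means the caller was
// the last member and must free the node. Single-threaded demo with stand-in
// types; the runtime does this under the fork/join lock. Illustrative only.
#if 0
#include <stdio.h>
#include <stdlib.h>

typedef struct cg_node { int nthreads; } cg_node_t;

static void leave_group(cg_node_t **node_ref) {
  cg_node_t *node = *node_ref;
  int old = node->nthreads--;   /* previous value, like cg_nthreads-- above */
  printf("leaving, %d member(s) remain\n", node->nthreads);
  if (old == 1) {               /* we were the last member */
    free(node);
    *node_ref = NULL;
  }
}

int main(void) {
  cg_node_t *cg = (cg_node_t *)malloc(sizeof(cg_node_t));
  cg->nthreads = 2;
  leave_group(&cg); /* 1 remains, node kept  */
  leave_group(&cg); /* 0 remain,  node freed */
  return 0;
}
#endif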
3977 
3978 void __kmp_unregister_root_current_thread(int gtid) {
3979  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3980  /* this lock should be ok, since unregister_root_current_thread is never
3981  called during an abort, only during a normal close. furthermore, if you
3982  have the forkjoin lock, you should never try to get the initz lock */
3983  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3984  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3985  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3986  "exiting T#%d\n",
3987  gtid));
3988  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3989  return;
3990  }
3991  kmp_root_t *root = __kmp_root[gtid];
3992 
3993  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3994  KMP_ASSERT(KMP_UBER_GTID(gtid));
3995  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3996  KMP_ASSERT(root->r.r_active == FALSE);
3997 
3998  KMP_MB();
3999 
4000  kmp_info_t *thread = __kmp_threads[gtid];
4001  kmp_team_t *team = thread->th.th_team;
4002  kmp_task_team_t *task_team = thread->th.th_task_team;
4003 
4004  // we need to wait for the proxy tasks before finishing the thread
4005  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4006 #if OMPT_SUPPORT
4007  // the runtime is shutting down so we won't report any events
4008  thread->th.ompt_thread_info.state = ompt_state_undefined;
4009 #endif
4010  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4011  }
4012 
4013  __kmp_reset_root(gtid, root);
4014 
4015  KMP_MB();
4016  KC_TRACE(10,
4017  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4018 
4019  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4020 }
4021 
4022 #if KMP_OS_WINDOWS
4023 /* __kmp_forkjoin_lock must be already held
4024  Unregisters a root thread that is not the current thread. Returns the number
4025  of __kmp_threads entries freed as a result. */
4026 static int __kmp_unregister_root_other_thread(int gtid) {
4027  kmp_root_t *root = __kmp_root[gtid];
4028  int r;
4029 
4030  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4031  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4032  KMP_ASSERT(KMP_UBER_GTID(gtid));
4033  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4034  KMP_ASSERT(root->r.r_active == FALSE);
4035 
4036  r = __kmp_reset_root(gtid, root);
4037  KC_TRACE(10,
4038  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4039  return r;
4040 }
4041 #endif
4042 
4043 #if KMP_DEBUG
4044 void __kmp_task_info() {
4045 
4046  kmp_int32 gtid = __kmp_entry_gtid();
4047  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4048  kmp_info_t *this_thr = __kmp_threads[gtid];
4049  kmp_team_t *steam = this_thr->th.th_serial_team;
4050  kmp_team_t *team = this_thr->th.th_team;
4051 
4052  __kmp_printf(
4053  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4054  "ptask=%p\n",
4055  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4056  team->t.t_implicit_task_taskdata[tid].td_parent);
4057 }
4058 #endif // KMP_DEBUG
4059 
4060 /* TODO optimize with one big memclr, take out what isn't needed, split
4061  responsibility to workers as much as possible, and delay initialization of
4062  features as much as possible */
4063 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4064  int tid, int gtid) {
4065  /* this_thr->th.th_info.ds.ds_gtid is setup in
4066  kmp_allocate_thread/create_worker.
4067  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4068  kmp_info_t *master = team->t.t_threads[0];
4069  KMP_DEBUG_ASSERT(this_thr != NULL);
4070  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4071  KMP_DEBUG_ASSERT(team);
4072  KMP_DEBUG_ASSERT(team->t.t_threads);
4073  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4074  KMP_DEBUG_ASSERT(master);
4075  KMP_DEBUG_ASSERT(master->th.th_root);
4076 
4077  KMP_MB();
4078 
4079  TCW_SYNC_PTR(this_thr->th.th_team, team);
4080 
4081  this_thr->th.th_info.ds.ds_tid = tid;
4082  this_thr->th.th_set_nproc = 0;
4083  if (__kmp_tasking_mode != tskm_immediate_exec)
4084  // When tasking is possible, threads are not safe to reap until they are
4085  // done tasking; this will be set when tasking code is exited in wait
4086  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4087  else // no tasking --> always safe to reap
4088  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4089  this_thr->th.th_set_proc_bind = proc_bind_default;
4090 #if KMP_AFFINITY_SUPPORTED
4091  this_thr->th.th_new_place = this_thr->th.th_current_place;
4092 #endif
4093  this_thr->th.th_root = master->th.th_root;
4094 
4095  /* setup the thread's cache of the team structure */
4096  this_thr->th.th_team_nproc = team->t.t_nproc;
4097  this_thr->th.th_team_master = master;
4098  this_thr->th.th_team_serialized = team->t.t_serialized;
4099  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4100 
4101  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4102 
4103  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4104  tid, gtid, this_thr, this_thr->th.th_current_task));
4105 
4106  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4107  team, tid, TRUE);
4108 
4109  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4110  tid, gtid, this_thr, this_thr->th.th_current_task));
4111  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4112  // __kmp_initialize_team()?
4113 
4114  /* TODO no worksharing in speculative threads */
4115  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4116 
4117  this_thr->th.th_local.this_construct = 0;
4118 
4119  if (!this_thr->th.th_pri_common) {
4120  this_thr->th.th_pri_common =
4121  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4122  if (__kmp_storage_map) {
4123  __kmp_print_storage_map_gtid(
4124  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4125  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4126  }
4127  this_thr->th.th_pri_head = NULL;
4128  }
4129 
4130  if (this_thr != master && // Master's CG root is initialized elsewhere
4131  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4132  // Make new thread's CG root same as master's
4133  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4134  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4135  if (tmp) {
4136  // worker changes CG, need to check if old CG should be freed
4137  int i = tmp->cg_nthreads--;
4138  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4139  " on node %p of thread %p to %d\n",
4140  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4141  if (i == 1) {
4142  __kmp_free(tmp); // last thread left CG --> free it
4143  }
4144  }
4145  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4146  // Increment new thread's CG root's counter to add the new thread
4147  this_thr->th.th_cg_roots->cg_nthreads++;
4148  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4149  " node %p of thread %p to %d\n",
4150  this_thr, this_thr->th.th_cg_roots,
4151  this_thr->th.th_cg_roots->cg_root,
4152  this_thr->th.th_cg_roots->cg_nthreads));
4153  this_thr->th.th_current_task->td_icvs.thread_limit =
4154  this_thr->th.th_cg_roots->cg_thread_limit;
4155  }
4156 
4157  /* Initialize dynamic dispatch */
4158  {
4159  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4160  // Use team max_nproc since this will never change for the team.
4161  size_t disp_size =
4162  sizeof(dispatch_private_info_t) *
4163  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4164  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4165  team->t.t_max_nproc));
4166  KMP_ASSERT(dispatch);
4167  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4168  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4169 
4170  dispatch->th_disp_index = 0;
4171  dispatch->th_doacross_buf_idx = 0;
4172  if (!dispatch->th_disp_buffer) {
4173  dispatch->th_disp_buffer =
4174  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4175 
4176  if (__kmp_storage_map) {
4177  __kmp_print_storage_map_gtid(
4178  gtid, &dispatch->th_disp_buffer[0],
4179  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4180  ? 1
4181  : __kmp_dispatch_num_buffers],
4182  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4183  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4184  gtid, team->t.t_id, gtid);
4185  }
4186  } else {
4187  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4188  }
4189 
4190  dispatch->th_dispatch_pr_current = 0;
4191  dispatch->th_dispatch_sh_current = 0;
4192 
4193  dispatch->th_deo_fcn = 0; /* ORDERED */
4194  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4195  }
4196 
4197  this_thr->th.th_next_pool = NULL;
4198 
4199  if (!this_thr->th.th_task_state_memo_stack) {
4200  size_t i;
4201  this_thr->th.th_task_state_memo_stack =
4202  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4203  this_thr->th.th_task_state_top = 0;
4204  this_thr->th.th_task_state_stack_sz = 4;
4205  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4206  ++i) // zero init the stack
4207  this_thr->th.th_task_state_memo_stack[i] = 0;
4208  }
4209 
4210  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4211  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4212 
4213  KMP_MB();
4214 }
4215 
4216 /* Allocate a new thread for the requesting team. This is only called from
4217  within a forkjoin critical section. We will first try to get an available
4218  thread from the thread pool. If none is available, we will fork a new one,
4219  assuming we are able to create one; this should be assured, as the caller
4220  should have checked for that first. */
4221 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4222  int new_tid) {
4223  kmp_team_t *serial_team;
4224  kmp_info_t *new_thr;
4225  int new_gtid;
4226 
4227  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4228  KMP_DEBUG_ASSERT(root && team);
4229 #if !KMP_NESTED_HOT_TEAMS
4230  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4231 #endif
4232  KMP_MB();
4233 
4234  /* first, try to get one from the thread pool */
4235  if (__kmp_thread_pool) {
4236  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4237  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4238  if (new_thr == __kmp_thread_pool_insert_pt) {
4239  __kmp_thread_pool_insert_pt = NULL;
4240  }
4241  TCW_4(new_thr->th.th_in_pool, FALSE);
4242  __kmp_suspend_initialize_thread(new_thr);
4243  __kmp_lock_suspend_mx(new_thr);
4244  if (new_thr->th.th_active_in_pool == TRUE) {
4245  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4246  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4247  new_thr->th.th_active_in_pool = FALSE;
4248  }
4249  __kmp_unlock_suspend_mx(new_thr);
4250 
4251  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4252  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4253  KMP_ASSERT(!new_thr->th.th_team);
4254  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4255 
4256  /* setup the thread structure */
4257  __kmp_initialize_info(new_thr, team, new_tid,
4258  new_thr->th.th_info.ds.ds_gtid);
4259  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4260 
4261  TCW_4(__kmp_nth, __kmp_nth + 1);
4262 
4263  new_thr->th.th_task_state = 0;
4264  new_thr->th.th_task_state_top = 0;
4265  new_thr->th.th_task_state_stack_sz = 4;
4266 
4267 #ifdef KMP_ADJUST_BLOCKTIME
4268  /* Adjust blocktime back to zero if necessary */
4269  /* Middle initialization might not have occurred yet */
4270  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4271  if (__kmp_nth > __kmp_avail_proc) {
4272  __kmp_zero_bt = TRUE;
4273  }
4274  }
4275 #endif /* KMP_ADJUST_BLOCKTIME */
4276 
4277 #if KMP_DEBUG
4278  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4279  // KMP_BARRIER_PARENT_FLAG.
4280  int b;
4281  kmp_balign_t *balign = new_thr->th.th_bar;
4282  for (b = 0; b < bs_last_barrier; ++b)
4283  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4284 #endif
4285 
4286  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4287  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4288 
4289  KMP_MB();
4290  return new_thr;
4291  }
4292 
4293  /* no thread available in the pool, so fork a new one */
4294  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4295  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4296 
4297 #if KMP_USE_MONITOR
4298  // If this is the first worker thread the RTL is creating, then also
4299  // launch the monitor thread. We try to do this as early as possible.
4300  if (!TCR_4(__kmp_init_monitor)) {
4301  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4302  if (!TCR_4(__kmp_init_monitor)) {
4303  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4304  TCW_4(__kmp_init_monitor, 1);
4305  __kmp_create_monitor(&__kmp_monitor);
4306  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4307 #if KMP_OS_WINDOWS
4308  // AC: wait until monitor has started. This is a fix for CQ232808.
4309  // The reason is that if the library is loaded/unloaded in a loop with
4310  // small (parallel) work in between, then there is a high probability that
4311  // the monitor thread starts after the library shutdown. At shutdown it is
4312  // too late to cope with the problem, because when the master is in
4313  // DllMain (process detach) the monitor has no chance to start (it is
4314  // blocked), and the master has no means to inform the monitor that the
4315  // library has gone, because all the memory which the monitor can access
4316  // is going to be released/reset.
4317  while (TCR_4(__kmp_init_monitor) < 2) {
4318  KMP_YIELD(TRUE);
4319  }
4320  KF_TRACE(10, ("after monitor thread has started\n"));
4321 #endif
4322  }
4323  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4324  }
4325 #endif
4326 
4327  KMP_MB();
4328 
4329  {
4330  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4331  ? 1
4332  : __kmp_hidden_helper_threads_num + 1;
4333 
4334  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4335  ++new_gtid) {
4336  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4337  }
4338 
4339  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4340  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4341  }
4342  }
4343 
4344  /* allocate space for it. */
4345  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4346 
4347  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4348 
4349 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4350  // suppress race conditions detection on synchronization flags in debug mode
4351  // this helps to analyze library internals eliminating false positives
4352  __itt_suppress_mark_range(
4353  __itt_suppress_range, __itt_suppress_threading_errors,
4354  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4355  __itt_suppress_mark_range(
4356  __itt_suppress_range, __itt_suppress_threading_errors,
4357  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4358 #if KMP_OS_WINDOWS
4359  __itt_suppress_mark_range(
4360  __itt_suppress_range, __itt_suppress_threading_errors,
4361  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4362 #else
4363  __itt_suppress_mark_range(__itt_suppress_range,
4364  __itt_suppress_threading_errors,
4365  &new_thr->th.th_suspend_init_count,
4366  sizeof(new_thr->th.th_suspend_init_count));
4367 #endif
4368  // TODO: check if we need to also suppress b_arrived flags
4369  __itt_suppress_mark_range(__itt_suppress_range,
4370  __itt_suppress_threading_errors,
4371  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4372  sizeof(new_thr->th.th_bar[0].bb.b_go));
4373  __itt_suppress_mark_range(__itt_suppress_range,
4374  __itt_suppress_threading_errors,
4375  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4376  sizeof(new_thr->th.th_bar[1].bb.b_go));
4377  __itt_suppress_mark_range(__itt_suppress_range,
4378  __itt_suppress_threading_errors,
4379  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4380  sizeof(new_thr->th.th_bar[2].bb.b_go));
4381 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4382  if (__kmp_storage_map) {
4383  __kmp_print_thread_storage_map(new_thr, new_gtid);
4384  }
4385 
4386  // add the reserve serialized team, initialized from the team's master thread
4387  {
4388  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4389  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4390  new_thr->th.th_serial_team = serial_team =
4391  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4392 #if OMPT_SUPPORT
4393  ompt_data_none, // root parallel id
4394 #endif
4395  proc_bind_default, &r_icvs,
4396  0 USE_NESTED_HOT_ARG(NULL));
4397  }
4398  KMP_ASSERT(serial_team);
4399  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4400  // for execution (it is unused for now).
4401  serial_team->t.t_threads[0] = new_thr;
4402  KF_TRACE(10,
4403  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4404  new_thr));
4405 
4406  /* setup the thread structures */
4407  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4408 
4409 #if USE_FAST_MEMORY
4410  __kmp_initialize_fast_memory(new_thr);
4411 #endif /* USE_FAST_MEMORY */
4412 
4413 #if KMP_USE_BGET
4414  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4415  __kmp_initialize_bget(new_thr);
4416 #endif
4417 
4418  __kmp_init_random(new_thr); // Initialize random number generator
4419 
4420  /* Initialize these only once when thread is grabbed for a team allocation */
4421  KA_TRACE(20,
4422  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4423  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4424 
4425  int b;
4426  kmp_balign_t *balign = new_thr->th.th_bar;
4427  for (b = 0; b < bs_last_barrier; ++b) {
4428  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4429  balign[b].bb.team = NULL;
4430  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4431  balign[b].bb.use_oncore_barrier = 0;
4432  }
4433 
4434  new_thr->th.th_spin_here = FALSE;
4435  new_thr->th.th_next_waiting = 0;
4436 #if KMP_OS_UNIX
4437  new_thr->th.th_blocking = false;
4438 #endif
4439 
4440 #if KMP_AFFINITY_SUPPORTED
4441  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4442  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4443  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4444  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4445 #endif
4446  new_thr->th.th_def_allocator = __kmp_def_allocator;
4447  new_thr->th.th_prev_level = 0;
4448  new_thr->th.th_prev_num_threads = 1;
4449 
4450  TCW_4(new_thr->th.th_in_pool, FALSE);
4451  new_thr->th.th_active_in_pool = FALSE;
4452  TCW_4(new_thr->th.th_active, TRUE);
4453 
4454  /* adjust the global counters */
4455  __kmp_all_nth++;
4456  __kmp_nth++;
4457 
4458  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4459  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4460  if (__kmp_adjust_gtid_mode) {
4461  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4462  if (TCR_4(__kmp_gtid_mode) != 2) {
4463  TCW_4(__kmp_gtid_mode, 2);
4464  }
4465  } else {
4466  if (TCR_4(__kmp_gtid_mode) != 1) {
4467  TCW_4(__kmp_gtid_mode, 1);
4468  }
4469  }
4470  }
4471 
4472 #ifdef KMP_ADJUST_BLOCKTIME
4473  /* Adjust blocktime back to zero if necessary */
4474  /* Middle initialization might not have occurred yet */
4475  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4476  if (__kmp_nth > __kmp_avail_proc) {
4477  __kmp_zero_bt = TRUE;
4478  }
4479  }
4480 #endif /* KMP_ADJUST_BLOCKTIME */
4481 
4482  /* actually fork it and create the new worker thread */
4483  KF_TRACE(
4484  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4485  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4486  KF_TRACE(10,
4487  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4488 
4489  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4490  new_gtid));
4491  KMP_MB();
4492  return new_thr;
4493 }
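
// Generic sketch of the pool-first pattern used above: pop a worker from a
// LIFO free list if one is available, otherwise create a fresh one. The worker
// type, ids and single-threaded setting are stand-ins; illustrative only.
#if 0
#include <stdio.h>
#include <stdlib.h>

typedef struct worker { int id; struct worker *next_pool; } worker_t;

static worker_t *pool = NULL; /* head of the free list */
static int next_id = 0;

static worker_t *acquire_worker(void) {
  if (pool) {                   /* reuse: keeps the OS thread and its caches */
    worker_t *w = pool;
    pool = w->next_pool;
    w->next_pool = NULL;
    return w;
  }
  worker_t *w = (worker_t *)calloc(1, sizeof(worker_t)); /* "fork" a new one */
  w->id = next_id++;
  return w;
}

static void release_worker(worker_t *w) { /* push back for later reuse */
  w->next_pool = pool;
  pool = w;
}

int main(void) {
  worker_t *a = acquire_worker(); /* created: id 0 */
  release_worker(a);
  worker_t *b = acquire_worker(); /* reused:  id 0 */
  worker_t *c = acquire_worker(); /* created: id 1 */
  printf("b=%d c=%d\n", b->id, c->id);
  free(b);
  free(c);
  return 0;
}
#endif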
4494 
4495 /* Reinitialize team for reuse.
4496  The hot team code calls this routine at every fork barrier, so the EPCC
4497  barrier tests are extremely sensitive to changes in it, especially writes to
4498  the team struct, which cause a cache invalidation in all threads.
4499  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4500 static void __kmp_reinitialize_team(kmp_team_t *team,
4501  kmp_internal_control_t *new_icvs,
4502  ident_t *loc) {
4503  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4504  team->t.t_threads[0], team));
4505  KMP_DEBUG_ASSERT(team && new_icvs);
4506  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4507  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4508 
4509  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4510  // Copy ICVs to the master thread's implicit taskdata
4511  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4512  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4513 
4514  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4515  team->t.t_threads[0], team));
4516 }
4517 
4518 /* Initialize the team data structure.
4519  This assumes the t_threads and t_max_nproc are already set.
4520  Also, we don't touch the arguments */
4521 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4522  kmp_internal_control_t *new_icvs,
4523  ident_t *loc) {
4524  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4525 
4526  /* verify */
4527  KMP_DEBUG_ASSERT(team);
4528  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4529  KMP_DEBUG_ASSERT(team->t.t_threads);
4530  KMP_MB();
4531 
4532  team->t.t_master_tid = 0; /* not needed */
4533  /* team->t.t_master_bar; not needed */
4534  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4535  team->t.t_nproc = new_nproc;
4536 
4537  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4538  team->t.t_next_pool = NULL;
4539  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4540  * up hot team */
4541 
4542  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4543  team->t.t_invoke = NULL; /* not needed */
4544 
4545  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4546  team->t.t_sched.sched = new_icvs->sched.sched;
4547 
4548 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4549  team->t.t_fp_control_saved = FALSE; /* not needed */
4550  team->t.t_x87_fpu_control_word = 0; /* not needed */
4551  team->t.t_mxcsr = 0; /* not needed */
4552 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4553 
4554  team->t.t_construct = 0;
4555 
4556  team->t.t_ordered.dt.t_value = 0;
4557  team->t.t_master_active = FALSE;
4558 
4559 #ifdef KMP_DEBUG
4560  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4561 #endif
4562 #if KMP_OS_WINDOWS
4563  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4564 #endif
4565 
4566  team->t.t_control_stack_top = NULL;
4567 
4568  __kmp_reinitialize_team(team, new_icvs, loc);
4569 
4570  KMP_MB();
4571  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4572 }
4573 
4574 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4575 /* Sets full mask for thread, saving the old mask in *old_mask; no changes to structures. */
4576 static void
4577 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4578  if (KMP_AFFINITY_CAPABLE()) {
4579  int status;
4580  if (old_mask != NULL) {
4581  status = __kmp_get_system_affinity(old_mask, TRUE);
4582  int error = errno;
4583  if (status != 0) {
4584  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4585  __kmp_msg_null);
4586  }
4587  }
4588  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4589  }
4590 }
4591 #endif
4592 
4593 #if KMP_AFFINITY_SUPPORTED
4594 
4595 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4596 // It calculates the worker + master thread's partition based upon the parent
4597  // thread's partition, and binds each worker to a place in its partition.
4598 // The master thread's partition should already include its current binding.
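 // Illustrative example (hypothetical numbers, not taken from this file):
 // suppose the parent partition is places [2,5] and the master is currently
 // bound to place 3. Under proc_bind_master all workers get th_new_place = 3;
 // under proc_bind_close with a team of 4 the workers are bound round-robin
 // to places 4, 5 and 2, wrapping within the inherited [2,5] partition.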
4599 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4600  // Copy the master thread's place partition to the team struct
4601  kmp_info_t *master_th = team->t.t_threads[0];
4602  KMP_DEBUG_ASSERT(master_th != NULL);
4603  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4604  int first_place = master_th->th.th_first_place;
4605  int last_place = master_th->th.th_last_place;
4606  int masters_place = master_th->th.th_current_place;
4607  team->t.t_first_place = first_place;
4608  team->t.t_last_place = last_place;
4609 
4610  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4611  "bound to place %d partition = [%d,%d]\n",
4612  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4613  team->t.t_id, masters_place, first_place, last_place));
4614 
4615  switch (proc_bind) {
4616 
4617  case proc_bind_default:
4618  // serial teams might have the proc_bind policy set to proc_bind_default. It
4619  // doesn't matter, as we don't rebind the master thread for any proc_bind policy
4620  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4621  break;
4622 
4623  case proc_bind_master: {
4624  int f;
4625  int n_th = team->t.t_nproc;
4626  for (f = 1; f < n_th; f++) {
4627  kmp_info_t *th = team->t.t_threads[f];
4628  KMP_DEBUG_ASSERT(th != NULL);
4629  th->th.th_first_place = first_place;
4630  th->th.th_last_place = last_place;
4631  th->th.th_new_place = masters_place;
4632  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4633  team->t.t_display_affinity != 1) {
4634  team->t.t_display_affinity = 1;
4635  }
4636 
4637  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4638  "partition = [%d,%d]\n",
4639  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4640  f, masters_place, first_place, last_place));
4641  }
4642  } break;
4643 
4644  case proc_bind_close: {
4645  int f;
4646  int n_th = team->t.t_nproc;
4647  int n_places;
4648  if (first_place <= last_place) {
4649  n_places = last_place - first_place + 1;
4650  } else {
4651  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4652  }
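 // e.g. (illustrative): with __kmp_affinity_num_masks == 8 and a wrapped
 // partition first_place = 6, last_place = 1, the partition covers places
 // {6, 7, 0, 1}, so n_places = 8 - 6 + 1 + 1 = 4.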
4653  if (n_th <= n_places) {
4654  int place = masters_place;
4655  for (f = 1; f < n_th; f++) {
4656  kmp_info_t *th = team->t.t_threads[f];
4657  KMP_DEBUG_ASSERT(th != NULL);
4658 
4659  if (place == last_place) {
4660  place = first_place;
4661  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4662  place = 0;
4663  } else {
4664  place++;
4665  }
4666  th->th.th_first_place = first_place;
4667  th->th.th_last_place = last_place;
4668  th->th.th_new_place = place;
4669  if (__kmp_display_affinity && place != th->th.th_current_place &&
4670  team->t.t_display_affinity != 1) {
4671  team->t.t_display_affinity = 1;
4672  }
4673 
4674  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4675  "partition = [%d,%d]\n",
4676  __kmp_gtid_from_thread(team->t.t_threads[f]),
4677  team->t.t_id, f, place, first_place, last_place));
4678  }
4679  } else {
4680  int S, rem, gap, s_count;
4681  S = n_th / n_places;
4682  s_count = 0;
4683  rem = n_th - (S * n_places);
4684  gap = rem > 0 ? n_places / rem : n_places;
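 // e.g. (illustrative): n_th = 10 threads over n_places = 4 places gives
 // S = 2, rem = 2, gap = 2, so every second place visited in the round-robin
 // receives S + 1 = 3 threads and the remaining places receive 2.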
4685  int place = masters_place;
4686  int gap_ct = gap;
4687  for (f = 0; f < n_th; f++) {
4688  kmp_info_t *th = team->t.t_threads[f];
4689  KMP_DEBUG_ASSERT(th != NULL);
4690 
4691  th->th.th_first_place = first_place;
4692  th->th.th_last_place = last_place;
4693  th->th.th_new_place = place;
4694  if (__kmp_display_affinity && place != th->th.th_current_place &&
4695  team->t.t_display_affinity != 1) {
4696  team->t.t_display_affinity = 1;
4697  }
4698  s_count++;
4699 
4700  if ((s_count == S) && rem && (gap_ct == gap)) {
4701  // do nothing; an extra thread is added to this place on the next iteration
4702  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4703  // we added an extra thread to this place; move to next place
4704  if (place == last_place) {
4705  place = first_place;
4706  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4707  place = 0;
4708  } else {
4709  place++;
4710  }
4711  s_count = 0;
4712  gap_ct = 1;
4713  rem--;
4714  } else if (s_count == S) { // place full; don't add extra
4715  if (place == last_place) {
4716  place = first_place;
4717  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4718  place = 0;
4719  } else {
4720  place++;
4721  }
4722  gap_ct++;
4723  s_count = 0;
4724  }
4725 
4726  KA_TRACE(100,
4727  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4728  "partition = [%d,%d]\n",
4729  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4730  th->th.th_new_place, first_place, last_place));
4731  }
4732  KMP_DEBUG_ASSERT(place == masters_place);
4733  }
4734  } break;
4735 
4736  case proc_bind_spread: {
4737  int f;
4738  int n_th = team->t.t_nproc;
4739  int n_places;
4740  int thidx;
4741  if (first_place <= last_place) {
4742  n_places = last_place - first_place + 1;
4743  } else {
4744  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4745  }
4746  if (n_th <= n_places) {
4747  int place = -1;
4748 
4749  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4750  int S = n_places / n_th;
4751  int s_count, rem, gap, gap_ct;
4752 
4753  place = masters_place;
4754  rem = n_places - n_th * S;
4755  gap = rem ? n_th / rem : 1;
4756  gap_ct = gap;
4757  thidx = n_th;
4758  if (update_master_only == 1)
4759  thidx = 1;
4760  for (f = 0; f < thidx; f++) {
4761  kmp_info_t *th = team->t.t_threads[f];
4762  KMP_DEBUG_ASSERT(th != NULL);
4763 
4764  th->th.th_first_place = place;
4765  th->th.th_new_place = place;
4766  if (__kmp_display_affinity && place != th->th.th_current_place &&
4767  team->t.t_display_affinity != 1) {
4768  team->t.t_display_affinity = 1;
4769  }
4770  s_count = 1;
4771  while (s_count < S) {
4772  if (place == last_place) {
4773  place = first_place;
4774  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4775  place = 0;
4776  } else {
4777  place++;
4778  }
4779  s_count++;
4780  }
4781  if (rem && (gap_ct == gap)) {
4782  if (place == last_place) {
4783  place = first_place;
4784  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4785  place = 0;
4786  } else {
4787  place++;
4788  }
4789  rem--;
4790  gap_ct = 0;
4791  }
4792  th->th.th_last_place = place;
4793  gap_ct++;
4794 
4795  if (place == last_place) {
4796  place = first_place;
4797  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4798  place = 0;
4799  } else {
4800  place++;
4801  }
4802 
4803  KA_TRACE(100,
4804  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4805  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4806  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4807  f, th->th.th_new_place, th->th.th_first_place,
4808  th->th.th_last_place, __kmp_affinity_num_masks));
4809  }
4810  } else {
4811  /* With a uniform space of available computation places we can create
4812  T partitions of roughly round(P/T) places each and put each thread into
4813  the first place of its partition. */
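 // Illustrative example (hypothetical values): masters_place = 0,
 // n_places = 8, n_th = 3 gives spacing = 3.0 and partitions [0,2], [3,5],
 // [6,7]; each thread is then pinned to the first place of its partition.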
4814  double current = static_cast<double>(masters_place);
4815  double spacing =
4816  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4817  int first, last;
4818  kmp_info_t *th;
4819 
4820  thidx = n_th + 1;
4821  if (update_master_only == 1)
4822  thidx = 1;
4823  for (f = 0; f < thidx; f++) {
4824  first = static_cast<int>(current);
4825  last = static_cast<int>(current + spacing) - 1;
4826  KMP_DEBUG_ASSERT(last >= first);
4827  if (first >= n_places) {
4828  if (masters_place) {
4829  first -= n_places;
4830  last -= n_places;
4831  if (first == (masters_place + 1)) {
4832  KMP_DEBUG_ASSERT(f == n_th);
4833  first--;
4834  }
4835  if (last == masters_place) {
4836  KMP_DEBUG_ASSERT(f == (n_th - 1));
4837  last--;
4838  }
4839  } else {
4840  KMP_DEBUG_ASSERT(f == n_th);
4841  first = 0;
4842  last = 0;
4843  }
4844  }
4845  if (last >= n_places) {
4846  last = (n_places - 1);
4847  }
4848  place = first;
4849  current += spacing;
4850  if (f < n_th) {
4851  KMP_DEBUG_ASSERT(0 <= first);
4852  KMP_DEBUG_ASSERT(n_places > first);
4853  KMP_DEBUG_ASSERT(0 <= last);
4854  KMP_DEBUG_ASSERT(n_places > last);
4855  KMP_DEBUG_ASSERT(last_place >= first_place);
4856  th = team->t.t_threads[f];
4857  KMP_DEBUG_ASSERT(th);
4858  th->th.th_first_place = first;
4859  th->th.th_new_place = place;
4860  th->th.th_last_place = last;
4861  if (__kmp_display_affinity && place != th->th.th_current_place &&
4862  team->t.t_display_affinity != 1) {
4863  team->t.t_display_affinity = 1;
4864  }
4865  KA_TRACE(100,
4866  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4867  "partition = [%d,%d], spacing = %.4f\n",
4868  __kmp_gtid_from_thread(team->t.t_threads[f]),
4869  team->t.t_id, f, th->th.th_new_place,
4870  th->th.th_first_place, th->th.th_last_place, spacing));
4871  }
4872  }
4873  }
4874  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4875  } else {
4876  int S, rem, gap, s_count;
4877  S = n_th / n_places;
4878  s_count = 0;
4879  rem = n_th - (S * n_places);
4880  gap = rem > 0 ? n_places / rem : n_places;
4881  int place = masters_place;
4882  int gap_ct = gap;
4883  thidx = n_th;
4884  if (update_master_only == 1)
4885  thidx = 1;
4886  for (f = 0; f < thidx; f++) {
4887  kmp_info_t *th = team->t.t_threads[f];
4888  KMP_DEBUG_ASSERT(th != NULL);
4889 
4890  th->th.th_first_place = place;
4891  th->th.th_last_place = place;
4892  th->th.th_new_place = place;
4893  if (__kmp_display_affinity && place != th->th.th_current_place &&
4894  team->t.t_display_affinity != 1) {
4895  team->t.t_display_affinity = 1;
4896  }
4897  s_count++;
4898 
4899  if ((s_count == S) && rem && (gap_ct == gap)) {
4900  // do nothing; an extra thread is added to this place on the next iteration
4901  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4902  // we added an extra thread to this place; move on to next place
4903  if (place == last_place) {
4904  place = first_place;
4905  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4906  place = 0;
4907  } else {
4908  place++;
4909  }
4910  s_count = 0;
4911  gap_ct = 1;
4912  rem--;
4913  } else if (s_count == S) { // place is full; don't add extra thread
4914  if (place == last_place) {
4915  place = first_place;
4916  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4917  place = 0;
4918  } else {
4919  place++;
4920  }
4921  gap_ct++;
4922  s_count = 0;
4923  }
4924 
4925  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4926  "partition = [%d,%d]\n",
4927  __kmp_gtid_from_thread(team->t.t_threads[f]),
4928  team->t.t_id, f, th->th.th_new_place,
4929  th->th.th_first_place, th->th.th_last_place));
4930  }
4931  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4932  }
4933  } break;
4934 
4935  default:
4936  break;
4937  }
4938 
4939  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4940 }
4941 
4942 #endif // KMP_AFFINITY_SUPPORTED
4943 
4944 /* allocate a new team data structure to use. take one off of the free pool if
4945  available */
4946 kmp_team_t *
4947 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4948 #if OMPT_SUPPORT
4949  ompt_data_t ompt_parallel_data,
4950 #endif
4951  kmp_proc_bind_t new_proc_bind,
4952  kmp_internal_control_t *new_icvs,
4953  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4954  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4955  int f;
4956  kmp_team_t *team;
4957  int use_hot_team = !root->r.r_active;
4958  int level = 0;
4959 
4960  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4961  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4962  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4963  KMP_MB();
4964 
4965 #if KMP_NESTED_HOT_TEAMS
4966  kmp_hot_team_ptr_t *hot_teams;
4967  if (master) {
4968  team = master->th.th_team;
4969  level = team->t.t_active_level;
4970  if (master->th.th_teams_microtask) { // in teams construct?
4971  if (master->th.th_teams_size.nteams > 1 &&
4972  ( // #teams > 1
4973  team->t.t_pkfn ==
4974  (microtask_t)__kmp_teams_master || // inner fork of the teams
4975  master->th.th_teams_level <
4976  team->t.t_level)) { // or nested parallel inside the teams
4977  ++level; // don't increment if #teams==1 or for outer fork of the teams;
4978  // increment otherwise
4979  }
4980  }
4981  hot_teams = master->th.th_hot_teams;
4982  if (level < __kmp_hot_teams_max_level && hot_teams &&
4983  hot_teams[level].hot_team) {
4984  // hot team has already been allocated for given level
4985  use_hot_team = 1;
4986  } else {
4987  use_hot_team = 0;
4988  }
4989  } else {
4990  // check we won't access uninitialized hot_teams, just in case
4991  KMP_DEBUG_ASSERT(new_nproc == 1);
4992  }
4993 #endif
4994  // Optimization to use a "hot" team
4995  if (use_hot_team && new_nproc > 1) {
4996  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4997 #if KMP_NESTED_HOT_TEAMS
4998  team = hot_teams[level].hot_team;
4999 #else
5000  team = root->r.r_hot_team;
5001 #endif
5002 #if KMP_DEBUG
5003  if (__kmp_tasking_mode != tskm_immediate_exec) {
5004  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5005  "task_team[1] = %p before reinit\n",
5006  team->t.t_task_team[0], team->t.t_task_team[1]));
5007  }
5008 #endif
5009 
5010  // Has the number of threads changed?
5011  /* Let's assume the most common case is that the number of threads is
5012  unchanged, and put that case first. */
5013  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5014  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5015  // This case can mean that omp_set_num_threads() was called and the hot
5016  // team size was already reduced, so we check the special flag
5017  if (team->t.t_size_changed == -1) {
5018  team->t.t_size_changed = 1;
5019  } else {
5020  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5021  }
5022 
5023  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5024  kmp_r_sched_t new_sched = new_icvs->sched;
5025  // set master's schedule as new run-time schedule
5026  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5027 
5028  __kmp_reinitialize_team(team, new_icvs,
5029  root->r.r_uber_thread->th.th_ident);
5030 
5031  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5032  team->t.t_threads[0], team));
5033  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5034 
5035 #if KMP_AFFINITY_SUPPORTED
5036  if ((team->t.t_size_changed == 0) &&
5037  (team->t.t_proc_bind == new_proc_bind)) {
5038  if (new_proc_bind == proc_bind_spread) {
5039  __kmp_partition_places(
5040  team, 1); // add flag to update only master for spread
5041  }
5042  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5043  "proc_bind = %d, partition = [%d,%d]\n",
5044  team->t.t_id, new_proc_bind, team->t.t_first_place,
5045  team->t.t_last_place));
5046  } else {
5047  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5048  __kmp_partition_places(team);
5049  }
5050 #else
5051  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5052 #endif /* KMP_AFFINITY_SUPPORTED */
5053  } else if (team->t.t_nproc > new_nproc) {
5054  KA_TRACE(20,
5055  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5056  new_nproc));
5057 
5058  team->t.t_size_changed = 1;
5059 #if KMP_NESTED_HOT_TEAMS
5060  if (__kmp_hot_teams_mode == 0) {
5061  // AC: saved number of threads should correspond to team's value in this
5062  // mode, can be bigger in mode 1, when hot team has threads in reserve
5063  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5064  hot_teams[level].hot_team_nth = new_nproc;
5065 #endif // KMP_NESTED_HOT_TEAMS
5066  /* release the extra threads we don't need any more */
5067  for (f = new_nproc; f < team->t.t_nproc; f++) {
5068  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5069  if (__kmp_tasking_mode != tskm_immediate_exec) {
5070  // When decreasing team size, threads no longer in the team should
5071  // unref task team.
5072  team->t.t_threads[f]->th.th_task_team = NULL;
5073  }
5074  __kmp_free_thread(team->t.t_threads[f]);
5075  team->t.t_threads[f] = NULL;
5076  }
5077 #if KMP_NESTED_HOT_TEAMS
5078  } // (__kmp_hot_teams_mode == 0)
5079  else {
5080  // When keeping extra threads in team, switch threads to wait on own
5081  // b_go flag
5082  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5083  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5084  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5085  for (int b = 0; b < bs_last_barrier; ++b) {
5086  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5087  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5088  }
5089  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5090  }
5091  }
5092  }
5093 #endif // KMP_NESTED_HOT_TEAMS
5094  team->t.t_nproc = new_nproc;
5095  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5096  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5097  __kmp_reinitialize_team(team, new_icvs,
5098  root->r.r_uber_thread->th.th_ident);
5099 
5100  // Update remaining threads
5101  for (f = 0; f < new_nproc; ++f) {
5102  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5103  }
5104 
5105  // restore the current task state of the master thread: should be the
5106  // implicit task
5107  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5108  team->t.t_threads[0], team));
5109 
5110  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5111 
5112 #ifdef KMP_DEBUG
5113  for (f = 0; f < team->t.t_nproc; f++) {
5114  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5115  team->t.t_threads[f]->th.th_team_nproc ==
5116  team->t.t_nproc);
5117  }
5118 #endif
5119 
5120  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5121 #if KMP_AFFINITY_SUPPORTED
5122  __kmp_partition_places(team);
5123 #endif
5124  } else { // team->t.t_nproc < new_nproc
5125 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5126  kmp_affin_mask_t *old_mask;
5127  if (KMP_AFFINITY_CAPABLE()) {
5128  KMP_CPU_ALLOC(old_mask);
5129  }
5130 #endif
5131 
5132  KA_TRACE(20,
5133  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5134  new_nproc));
5135 
5136  team->t.t_size_changed = 1;
5137 
5138 #if KMP_NESTED_HOT_TEAMS
5139  int avail_threads = hot_teams[level].hot_team_nth;
5140  if (new_nproc < avail_threads)
5141  avail_threads = new_nproc;
5142  kmp_info_t **other_threads = team->t.t_threads;
5143  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5144  // Adjust barrier data of reserved threads (if any) of the team
5145  // Other data will be set in __kmp_initialize_info() below.
5146  int b;
5147  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5148  for (b = 0; b < bs_last_barrier; ++b) {
5149  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5150  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5151 #if USE_DEBUGGER
5152  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5153 #endif
5154  }
5155  }
5156  if (hot_teams[level].hot_team_nth >= new_nproc) {
5157  // we have all needed threads in reserve, no need to allocate any
5158  // this is only possible in mode 1; we cannot have reserved threads in mode 0
5159  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5160  team->t.t_nproc = new_nproc; // just get reserved threads involved
5161  } else {
5162  // we may have some threads in reserve, but not enough
5163  team->t.t_nproc =
5164  hot_teams[level]
5165  .hot_team_nth; // get reserved threads involved if any
5166  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5167 #endif // KMP_NESTED_HOT_TEAMS
5168  if (team->t.t_max_nproc < new_nproc) {
5169  /* reallocate larger arrays */
5170  __kmp_reallocate_team_arrays(team, new_nproc);
5171  __kmp_reinitialize_team(team, new_icvs, NULL);
5172  }
5173 
5174 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5175  /* Temporarily set full mask for master thread before creation of
5176  workers. The reason is that workers inherit the affinity from the master,
5177  so if many workers are created on a single core in quick succession, they
5178  may not get a chance to set their own affinity for a long time. */
5179  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5180 #endif
5181 
5182  /* allocate new threads for the hot team */
5183  for (f = team->t.t_nproc; f < new_nproc; f++) {
5184  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5185  KMP_DEBUG_ASSERT(new_worker);
5186  team->t.t_threads[f] = new_worker;
5187 
5188  KA_TRACE(20,
5189  ("__kmp_allocate_team: team %d init T#%d arrived: "
5190  "join=%llu, plain=%llu\n",
5191  team->t.t_id, __kmp_gtid_from_tid(f, team),
5192  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5193  team->t.t_bar[bs_plain_barrier].b_arrived));
5194 
5195  { // Initialize barrier data for new threads.
5196  int b;
5197  kmp_balign_t *balign = new_worker->th.th_bar;
5198  for (b = 0; b < bs_last_barrier; ++b) {
5199  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5200  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5201  KMP_BARRIER_PARENT_FLAG);
5202 #if USE_DEBUGGER
5203  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5204 #endif
5205  }
5206  }
5207  }
5208 
5209 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5210  if (KMP_AFFINITY_CAPABLE()) {
5211  /* Restore initial master thread's affinity mask */
5212  __kmp_set_system_affinity(old_mask, TRUE);
5213  KMP_CPU_FREE(old_mask);
5214  }
5215 #endif
5216 #if KMP_NESTED_HOT_TEAMS
5217  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5218 #endif // KMP_NESTED_HOT_TEAMS
5219  /* make sure everyone is synchronized */
5220  int old_nproc = team->t.t_nproc; // save old value and use to update only
5221  // new threads below
5222  __kmp_initialize_team(team, new_nproc, new_icvs,
5223  root->r.r_uber_thread->th.th_ident);
5224 
5225  /* reinitialize the threads */
5226  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5227  for (f = 0; f < team->t.t_nproc; ++f)
5228  __kmp_initialize_info(team->t.t_threads[f], team, f,
5229  __kmp_gtid_from_tid(f, team));
5230 
5231  if (level) { // set th_task_state for new threads in nested hot team
5232  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5233  // only need to set the th_task_state for the new threads. th_task_state
5234  // for master thread will not be accurate until after this in
5235  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5236  // correct value.
5237  for (f = old_nproc; f < team->t.t_nproc; ++f)
5238  team->t.t_threads[f]->th.th_task_state =
5239  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5240  } else { // set th_task_state for new threads in non-nested hot team
5241  kmp_uint8 old_state =
5242  team->t.t_threads[0]->th.th_task_state; // copy master's state
5243  for (f = old_nproc; f < team->t.t_nproc; ++f)
5244  team->t.t_threads[f]->th.th_task_state = old_state;
5245  }
5246 
5247 #ifdef KMP_DEBUG
5248  for (f = 0; f < team->t.t_nproc; ++f) {
5249  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5250  team->t.t_threads[f]->th.th_team_nproc ==
5251  team->t.t_nproc);
5252  }
5253 #endif
5254 
5255  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256 #if KMP_AFFINITY_SUPPORTED
5257  __kmp_partition_places(team);
5258 #endif
5259  } // Check changes in number of threads
5260 
5261  kmp_info_t *master = team->t.t_threads[0];
5262  if (master->th.th_teams_microtask) {
5263  for (f = 1; f < new_nproc; ++f) {
5264  // propagate teams construct specific info to workers
5265  kmp_info_t *thr = team->t.t_threads[f];
5266  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5267  thr->th.th_teams_level = master->th.th_teams_level;
5268  thr->th.th_teams_size = master->th.th_teams_size;
5269  }
5270  }
5271 #if KMP_NESTED_HOT_TEAMS
5272  if (level) {
5273  // Sync barrier state for nested hot teams, not needed for outermost hot
5274  // team.
5275  for (f = 1; f < new_nproc; ++f) {
5276  kmp_info_t *thr = team->t.t_threads[f];
5277  int b;
5278  kmp_balign_t *balign = thr->th.th_bar;
5279  for (b = 0; b < bs_last_barrier; ++b) {
5280  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5281  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5282 #if USE_DEBUGGER
5283  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5284 #endif
5285  }
5286  }
5287  }
5288 #endif // KMP_NESTED_HOT_TEAMS
5289 
5290  /* reallocate space for arguments if necessary */
5291  __kmp_alloc_argv_entries(argc, team, TRUE);
5292  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5293  // The hot team re-uses the previous task team,
5294  // if untouched during the previous release->gather phase.
5295 
5296  KF_TRACE(10, (" hot_team = %p\n", team));
5297 
5298 #if KMP_DEBUG
5299  if (__kmp_tasking_mode != tskm_immediate_exec) {
5300  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5301  "task_team[1] = %p after reinit\n",
5302  team->t.t_task_team[0], team->t.t_task_team[1]));
5303  }
5304 #endif
5305 
5306 #if OMPT_SUPPORT
5307  __ompt_team_assign_id(team, ompt_parallel_data);
5308 #endif
5309 
5310  KMP_MB();
5311 
5312  return team;
5313  }
5314 
5315  /* next, let's try to take one from the team pool */
5316  KMP_MB();
5317  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5318  /* TODO: consider resizing undersized teams instead of reaping them, now
5319  that we have a resizing mechanism */
5320  if (team->t.t_max_nproc >= max_nproc) {
5321  /* take this team from the team pool */
5322  __kmp_team_pool = team->t.t_next_pool;
5323 
5324  /* setup the team for fresh use */
5325  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5326 
5327  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5328  "task_team[1] %p to NULL\n",
5329  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5330  team->t.t_task_team[0] = NULL;
5331  team->t.t_task_team[1] = NULL;
5332 
5333  /* reallocate space for arguments if necessary */
5334  __kmp_alloc_argv_entries(argc, team, TRUE);
5335  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5336 
5337  KA_TRACE(
5338  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5339  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5340  { // Initialize barrier data.
5341  int b;
5342  for (b = 0; b < bs_last_barrier; ++b) {
5343  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5344 #if USE_DEBUGGER
5345  team->t.t_bar[b].b_master_arrived = 0;
5346  team->t.t_bar[b].b_team_arrived = 0;
5347 #endif
5348  }
5349  }
5350 
5351  team->t.t_proc_bind = new_proc_bind;
5352 
5353  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5354  team->t.t_id));
5355 
5356 #if OMPT_SUPPORT
5357  __ompt_team_assign_id(team, ompt_parallel_data);
5358 #endif
5359 
5360  KMP_MB();
5361 
5362  return team;
5363  }
5364 
5365  /* reap team if it is too small, then loop back and check the next one */
5366  // not sure if this is wise, but it will be redone during the hot-teams
5367  // rewrite.
5368  /* TODO: Use technique to find the right size hot-team, don't reap them */
5369  team = __kmp_reap_team(team);
5370  __kmp_team_pool = team;
5371  }
5372 
5373  /* nothing available in the pool, no matter, make a new team! */
5374  KMP_MB();
5375  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5376 
5377  /* and set it up */
5378  team->t.t_max_nproc = max_nproc;
5379  /* NOTE: for some reason, allocating one big buffer and dividing it up
5380  seems to hurt performance significantly on the P4, so we don't do that */
5381  __kmp_allocate_team_arrays(team, max_nproc);
5382 
5383  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5384  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5385 
5386  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5387  "%p to NULL\n",
5388  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5389  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5390  // memory, no need to duplicate
5391  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5392  // memory, no need to duplicate
5393 
5394  if (__kmp_storage_map) {
5395  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5396  }
5397 
5398  /* allocate space for arguments */
5399  __kmp_alloc_argv_entries(argc, team, FALSE);
5400  team->t.t_argc = argc;
5401 
5402  KA_TRACE(20,
5403  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5404  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5405  { // Initialize barrier data.
5406  int b;
5407  for (b = 0; b < bs_last_barrier; ++b) {
5408  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5409 #if USE_DEBUGGER
5410  team->t.t_bar[b].b_master_arrived = 0;
5411  team->t.t_bar[b].b_team_arrived = 0;
5412 #endif
5413  }
5414  }
5415 
5416  team->t.t_proc_bind = new_proc_bind;
5417 
5418 #if OMPT_SUPPORT
5419  __ompt_team_assign_id(team, ompt_parallel_data);
5420  team->t.ompt_serialized_team_info = NULL;
5421 #endif
5422 
5423  KMP_MB();
5424 
5425  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5426  team->t.t_id));
5427 
5428  return team;
5429 }
5430 
5431 /* TODO implement hot-teams at all levels */
5432 /* TODO implement lazy thread release on demand (disband request) */
5433 
5434 /* free the team. return it to the team pool. release all the threads
5435  * associated with it */
5436 void __kmp_free_team(kmp_root_t *root,
5437  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5438  int f;
5439  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5440  team->t.t_id));
5441 
5442  /* verify state */
5443  KMP_DEBUG_ASSERT(root);
5444  KMP_DEBUG_ASSERT(team);
5445  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5446  KMP_DEBUG_ASSERT(team->t.t_threads);
5447 
5448  int use_hot_team = team == root->r.r_hot_team;
5449 #if KMP_NESTED_HOT_TEAMS
5450  int level;
5451  kmp_hot_team_ptr_t *hot_teams;
5452  if (master) {
5453  level = team->t.t_active_level - 1;
5454  if (master->th.th_teams_microtask) { // in teams construct?
5455  if (master->th.th_teams_size.nteams > 1) {
5456  ++level; // level was not increased in teams construct for
5457  // team_of_masters
5458  }
5459  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5460  master->th.th_teams_level == team->t.t_level) {
5461  ++level; // level was not increased in teams construct for
5462  // team_of_workers before the parallel
5463  } // team->t.t_level will be increased inside parallel
5464  }
5465  hot_teams = master->th.th_hot_teams;
5466  if (level < __kmp_hot_teams_max_level) {
5467  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5468  use_hot_team = 1;
5469  }
5470  }
5471 #endif // KMP_NESTED_HOT_TEAMS
5472 
5473  /* team is done working */
5474  TCW_SYNC_PTR(team->t.t_pkfn,
5475  NULL); // Important for Debugging Support Library.
5476 #if KMP_OS_WINDOWS
5477  team->t.t_copyin_counter = 0; // init counter for possible reuse
5478 #endif
5479  // Do not reset pointer to parent team to NULL for hot teams.
5480 
5481  /* if we are non-hot team, release our threads */
5482  if (!use_hot_team) {
5483  if (__kmp_tasking_mode != tskm_immediate_exec) {
5484  // Wait for threads to reach reapable state
5485  for (f = 1; f < team->t.t_nproc; ++f) {
5486  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5487  kmp_info_t *th = team->t.t_threads[f];
5488  volatile kmp_uint32 *state = &th->th.th_reap_state;
5489  while (*state != KMP_SAFE_TO_REAP) {
5490 #if KMP_OS_WINDOWS
5491  // On Windows a thread can be killed at any time, check this
5492  DWORD ecode;
5493  if (!__kmp_is_thread_alive(th, &ecode)) {
5494  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5495  break;
5496  }
5497 #endif
5498  // first check if thread is sleeping
5499  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5500  if (fl.is_sleeping())
5501  fl.resume(__kmp_gtid_from_thread(th));
5502  KMP_CPU_PAUSE();
5503  }
5504  }
5505 
5506  // Delete task teams
5507  int tt_idx;
5508  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5509  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5510  if (task_team != NULL) {
5511  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5512  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5513  team->t.t_threads[f]->th.th_task_team = NULL;
5514  }
5515  KA_TRACE(
5516  20,
5517  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5518  __kmp_get_gtid(), task_team, team->t.t_id));
5519 #if KMP_NESTED_HOT_TEAMS
5520  __kmp_free_task_team(master, task_team);
5521 #endif
5522  team->t.t_task_team[tt_idx] = NULL;
5523  }
5524  }
5525  }
5526 
5527  // Reset pointer to parent team only for non-hot teams.
5528  team->t.t_parent = NULL;
5529  team->t.t_level = 0;
5530  team->t.t_active_level = 0;
5531 
5532  /* free the worker threads */
5533  for (f = 1; f < team->t.t_nproc; ++f) {
5534  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5535  __kmp_free_thread(team->t.t_threads[f]);
5536  team->t.t_threads[f] = NULL;
5537  }
5538 
5539  /* put the team back in the team pool */
5540  /* TODO limit size of team pool, call reap_team if pool too large */
5541  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5542  __kmp_team_pool = (volatile kmp_team_t *)team;
5543  } else { // Check if team was created for the masters in a teams construct
5544  // See if first worker is a CG root
5545  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5546  team->t.t_threads[1]->th.th_cg_roots);
5547  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5548  // Clean up the CG root nodes on workers so that this team can be re-used
5549  for (f = 1; f < team->t.t_nproc; ++f) {
5550  kmp_info_t *thr = team->t.t_threads[f];
5551  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5552  thr->th.th_cg_roots->cg_root == thr);
5553  // Pop current CG root off list
5554  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5555  thr->th.th_cg_roots = tmp->up;
5556  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5557  " up to node %p. cg_nthreads was %d\n",
5558  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5559  int i = tmp->cg_nthreads--;
5560  if (i == 1) {
5561  __kmp_free(tmp); // free CG if we are the last thread in it
5562  }
5563  // Restore current task's thread_limit from CG root
5564  if (thr->th.th_cg_roots)
5565  thr->th.th_current_task->td_icvs.thread_limit =
5566  thr->th.th_cg_roots->cg_thread_limit;
5567  }
5568  }
5569  }
5570 
5571  KMP_MB();
5572 }
5573 
5574 /* reap the team. destroy it, reclaim all its resources and free its memory */
5575 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5576  kmp_team_t *next_pool = team->t.t_next_pool;
5577 
5578  KMP_DEBUG_ASSERT(team);
5579  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5580  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5581  KMP_DEBUG_ASSERT(team->t.t_threads);
5582  KMP_DEBUG_ASSERT(team->t.t_argv);
5583 
5584  /* TODO clean the threads that are a part of this? */
5585 
5586  /* free stuff */
5587  __kmp_free_team_arrays(team);
5588  if (team->t.t_argv != &team->t.t_inline_argv[0])
5589  __kmp_free((void *)team->t.t_argv);
5590  __kmp_free(team);
5591 
5592  KMP_MB();
5593  return next_pool;
5594 }
5595 
5596 // Free the thread. Don't reap it, just place it on the pool of available
5597 // threads.
5598 //
5599 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5600 // binding for the affinity mechanism to be useful.
5601 //
5602 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5603 // However, we want to avoid a potential performance problem by always
5604 // scanning through the list to find the correct point at which to insert
5605 // the thread (potential N**2 behavior). To do this we keep track of the
5606 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5607 // With single-level parallelism, threads will always be added to the tail
5608 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5609 // parallelism, all bets are off and we may need to scan through the entire
5610 // free list.
5611 //
5612 // This change also has a potentially large performance benefit, for some
5613 // applications. Previously, as threads were freed from the hot team, they
5614 // would be placed back on the free list in inverse order. If the hot team
5615  // grew back to its original size, then the freed threads would be placed
5616  // back on the hot team in reverse order. This could cause bad cache
5617  // locality problems on programs where the size of the hot team regularly
5618  // grew and shrank.
5619 //
5620 // Now, for single-level parallelism, the OMP tid is always == gtid.
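// Illustrative example (gtids are hypothetical): if the pool holds
// T#3 -> T#5 -> T#9 and T#7 is freed, the scan below starts just after
// __kmp_thread_pool_insert_pt when that hint's gtid is <= 7 (otherwise from
// the list head), walks past T#3 and T#5, links T#7 in front of T#9, and
// leaves __kmp_thread_pool_insert_pt pointing at T#7 for the next insertion.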
5621 void __kmp_free_thread(kmp_info_t *this_th) {
5622  int gtid;
5623  kmp_info_t **scan;
5624 
5625  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5626  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5627 
5628  KMP_DEBUG_ASSERT(this_th);
5629 
5630  // When moving a thread to the pool, switch it to wait on its own b_go flag
5631  // and clear its barrier team pointers (NULL team, i.e. uninitialized).
5632  int b;
5633  kmp_balign_t *balign = this_th->th.th_bar;
5634  for (b = 0; b < bs_last_barrier; ++b) {
5635  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5636  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5637  balign[b].bb.team = NULL;
5638  balign[b].bb.leaf_kids = 0;
5639  }
5640  this_th->th.th_task_state = 0;
5641  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5642 
5643  /* put thread back on the free pool */
5644  TCW_PTR(this_th->th.th_team, NULL);
5645  TCW_PTR(this_th->th.th_root, NULL);
5646  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5647 
5648  while (this_th->th.th_cg_roots) {
5649  this_th->th.th_cg_roots->cg_nthreads--;
5650  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5651  " %p of thread %p to %d\n",
5652  this_th, this_th->th.th_cg_roots,
5653  this_th->th.th_cg_roots->cg_root,
5654  this_th->th.th_cg_roots->cg_nthreads));
5655  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5656  if (tmp->cg_root == this_th) { // Thread is a cg_root
5657  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5658  KA_TRACE(
5659  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5660  this_th->th.th_cg_roots = tmp->up;
5661  __kmp_free(tmp);
5662  } else { // Worker thread
5663  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5664  __kmp_free(tmp);
5665  }
5666  this_th->th.th_cg_roots = NULL;
5667  break;
5668  }
5669  }
5670 
5671  /* The implicit task assigned to this thread may be used by other threads;
5672  * in that case multiple threads can share the data and try to free the task
5673  * in __kmp_reap_thread at exit. This duplicate use of the task data is more
5674  * likely when the hot team is disabled, but it can occur even when the hot
5675  * team is enabled */
5676  __kmp_free_implicit_task(this_th);
5677  this_th->th.th_current_task = NULL;
5678 
5679  // If the __kmp_thread_pool_insert_pt is already past the new insert
5680  // point, then we need to re-scan the entire list.
5681  gtid = this_th->th.th_info.ds.ds_gtid;
5682  if (__kmp_thread_pool_insert_pt != NULL) {
5683  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5684  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5685  __kmp_thread_pool_insert_pt = NULL;
5686  }
5687  }
5688 
5689  // Scan down the list to find the place to insert the thread.
5690  // scan is the address of a link in the list, possibly the address of
5691  // __kmp_thread_pool itself.
5692  //
5693  // In the absence of nested parallelism, the for loop will have 0 iterations.
5694  if (__kmp_thread_pool_insert_pt != NULL) {
5695  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5696  } else {
5697  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5698  }
5699  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5700  scan = &((*scan)->th.th_next_pool))
5701  ;
5702 
5703  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5704  // to its address.
5705  TCW_PTR(this_th->th.th_next_pool, *scan);
5706  __kmp_thread_pool_insert_pt = *scan = this_th;
5707  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5708  (this_th->th.th_info.ds.ds_gtid <
5709  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5710  TCW_4(this_th->th.th_in_pool, TRUE);
5711  __kmp_suspend_initialize_thread(this_th);
5712  __kmp_lock_suspend_mx(this_th);
5713  if (this_th->th.th_active == TRUE) {
5714  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5715  this_th->th.th_active_in_pool = TRUE;
5716  }
5717 #if KMP_DEBUG
5718  else {
5719  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5720  }
5721 #endif
5722  __kmp_unlock_suspend_mx(this_th);
5723 
5724  TCW_4(__kmp_nth, __kmp_nth - 1);
5725 
5726 #ifdef KMP_ADJUST_BLOCKTIME
5727  /* Adjust blocktime back to user setting or default if necessary */
5728  /* Middle initialization might never have occurred */
5729  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5730  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5731  if (__kmp_nth <= __kmp_avail_proc) {
5732  __kmp_zero_bt = FALSE;
5733  }
5734  }
5735 #endif /* KMP_ADJUST_BLOCKTIME */
5736 
5737  KMP_MB();
5738 }
5739 
5740 /* ------------------------------------------------------------------------ */
5741 
5742 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5743 #if OMPTARGET_PROFILING_SUPPORT
5744  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5745  // TODO: add a configuration option for time granularity
5746  if (ProfileTraceFile)
5747  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
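 // Illustrative usage (an assumption about the env var's meaning, not
 // verified here):
 //   LIBOMPTARGET_PROFILE=profile.json ./app
 // would have each runtime thread record llvm time-trace events at the
 // 500us granularity hard-coded above.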
5748 #endif
5749 
5750  int gtid = this_thr->th.th_info.ds.ds_gtid;
5751  /* void *stack_data;*/
5752  kmp_team_t **volatile pteam;
5753 
5754  KMP_MB();
5755  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5756 
5757  if (__kmp_env_consistency_check) {
5758  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5759  }
5760 
5761 #if OMPT_SUPPORT
5762  ompt_data_t *thread_data;
5763  if (ompt_enabled.enabled) {
5764  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5765  *thread_data = ompt_data_none;
5766 
5767  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5768  this_thr->th.ompt_thread_info.wait_id = 0;
5769  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5770  this_thr->th.ompt_thread_info.parallel_flags = 0;
5771  if (ompt_enabled.ompt_callback_thread_begin) {
5772  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5773  ompt_thread_worker, thread_data);
5774  }
5775  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5776  }
5777 #endif
5778 
5779  /* This is the place where threads wait for work */
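 // Worker lifecycle sketch (descriptive summary of the loop below): block in
 // __kmp_fork_barrier() until a master hands this thread a team, run the
 // parallel region through (*pteam)->t.t_invoke(gtid), synchronize in
 // __kmp_join_barrier(), then loop back and wait for more work until library
 // shutdown sets __kmp_global.g.g_done.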
5780  while (!TCR_4(__kmp_global.g.g_done)) {
5781  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5782  KMP_MB();
5783 
5784  /* wait for work to do */
5785  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5786 
5787  /* No tid yet since not part of a team */
5788  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5789 
5790 #if OMPT_SUPPORT
5791  if (ompt_enabled.enabled) {
5792  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5793  }
5794 #endif
5795 
5796  pteam = &this_thr->th.th_team;
5797 
5798  /* have we been allocated? */
5799  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5800  /* we were just woken up, so run our new task */
5801  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5802  int rc;
5803  KA_TRACE(20,
5804  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5805  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5806  (*pteam)->t.t_pkfn));
5807 
5808  updateHWFPControl(*pteam);
5809 
5810 #if OMPT_SUPPORT
5811  if (ompt_enabled.enabled) {
5812  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5813  }
5814 #endif
5815 
5816  rc = (*pteam)->t.t_invoke(gtid);
5817  KMP_ASSERT(rc);
5818 
5819  KMP_MB();
5820  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5821  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5822  (*pteam)->t.t_pkfn));
5823  }
5824 #if OMPT_SUPPORT
5825  if (ompt_enabled.enabled) {
5826  /* no frame set while outside task */
5827  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5828 
5829  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5830  }
5831 #endif
5832  /* join barrier after parallel region */
5833  __kmp_join_barrier(gtid);
5834  }
5835  }
5836  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5837 
5838 #if OMPT_SUPPORT
5839  if (ompt_enabled.ompt_callback_thread_end) {
5840  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5841  }
5842 #endif
5843 
5844  this_thr->th.th_task_team = NULL;
5845  /* run the destructors for the threadprivate data for this thread */
5846  __kmp_common_destroy_gtid(gtid);
5847 
5848  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5849  KMP_MB();
5850 
5851 #if OMPTARGET_PROFILING_SUPPORT
5852  llvm::timeTraceProfilerFinishThread();
5853 #endif
5854  return this_thr;
5855 }
5856 
5857 /* ------------------------------------------------------------------------ */
5858 
5859 void __kmp_internal_end_dest(void *specific_gtid) {
5860  // Make sure no significant bits are lost
5861  int gtid;
5862  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5863 
5864  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5865  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5866  * because 0 is reserved for the nothing-stored case */
5867 
5868  __kmp_internal_end_thread(gtid);
5869 }
5870 
5871 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5872 
5873 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5874  __kmp_internal_end_atexit();
5875 }
5876 
5877 #endif
5878 
5879 /* [Windows] josh: when the atexit handler is called, there may still be more
5880  than one thread alive */
5881 void __kmp_internal_end_atexit(void) {
5882  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5883  /* [Windows]
5884  josh: ideally, we want to completely shut down the library in this atexit
5885  handler, but stat code that depends on thread-specific data for gtid fails
5886  because that data becomes unavailable at some point during the shutdown, so
5887  we call __kmp_internal_end_thread instead. We should eventually remove the
5888  dependency on __kmp_get_specific_gtid in the stat code and use
5889  __kmp_internal_end_library to cleanly shut down the library.
5890 
5891  // TODO: Can some of this comment about GVS be removed?
5892  I suspect that the offending stat code is executed when the calling thread
5893  tries to clean up a dead root thread's data structures, resulting in GVS
5894  code trying to close the GVS structures for that thread, but since the stat
5895  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5896  the calling thread is cleaning up itself instead of another thread, it gets
5897  confused. This happens because allowing a thread to unregister and clean up
5898  another thread is a recent modification for addressing an issue.
5899  Based on the current design (20050722), a thread may end up
5900  trying to unregister another thread only if thread death does not trigger
5901  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5902  thread specific data destructor function to detect thread death. For
5903  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5904  is nothing. Thus, the workaround is applicable only for Windows static
5905  stat library. */
5906  __kmp_internal_end_library(-1);
5907 #if KMP_OS_WINDOWS
5908  __kmp_close_console();
5909 #endif
5910 }
5911 
5912 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5913  // It is assumed __kmp_forkjoin_lock is acquired.
5914 
5915  int gtid;
5916 
5917  KMP_DEBUG_ASSERT(thread != NULL);
5918 
5919  gtid = thread->th.th_info.ds.ds_gtid;
5920 
5921  if (!is_root) {
5922  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5923  /* Assume the threads are at the fork barrier here */
5924  KA_TRACE(
5925  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5926  gtid));
5927  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5928  * (GEH) */
5929  ANNOTATE_HAPPENS_BEFORE(thread);
5930  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5931  thread);
5932  __kmp_release_64(&flag);
5933  }
5934 
5935  // Terminate OS thread.
5936  __kmp_reap_worker(thread);
5937 
5938  // The thread was killed asynchronously. If it was actively
5939  // spinning in the thread pool, decrement the global count.
5940  //
5941  // There is a small timing hole here - if the worker thread was just waking
5942  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5943  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5944  // the global counter might not get updated.
5945  //
5946  // Currently, this can only happen as the library is unloaded,
5947  // so there are no harmful side effects.
5948  if (thread->th.th_active_in_pool) {
5949  thread->th.th_active_in_pool = FALSE;
5950  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5951  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5952  }
5953  }
5954 
5955  __kmp_free_implicit_task(thread);
5956 
5957 // Free the fast memory for tasking
5958 #if USE_FAST_MEMORY
5959  __kmp_free_fast_memory(thread);
5960 #endif /* USE_FAST_MEMORY */
5961 
5962  __kmp_suspend_uninitialize_thread(thread);
5963 
5964  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5965  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5966 
5967  --__kmp_all_nth;
5968 // __kmp_nth was decremented when thread is added to the pool.
5969 
5970 #ifdef KMP_ADJUST_BLOCKTIME
5971  /* Adjust blocktime back to user setting or default if necessary */
5972  /* Middle initialization might never have occurred */
5973  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5974  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5975  if (__kmp_nth <= __kmp_avail_proc) {
5976  __kmp_zero_bt = FALSE;
5977  }
5978  }
5979 #endif /* KMP_ADJUST_BLOCKTIME */
5980 
5981  /* free the memory being used */
5982  if (__kmp_env_consistency_check) {
5983  if (thread->th.th_cons) {
5984  __kmp_free_cons_stack(thread->th.th_cons);
5985  thread->th.th_cons = NULL;
5986  }
5987  }
5988 
5989  if (thread->th.th_pri_common != NULL) {
5990  __kmp_free(thread->th.th_pri_common);
5991  thread->th.th_pri_common = NULL;
5992  }
5993 
5994  if (thread->th.th_task_state_memo_stack != NULL) {
5995  __kmp_free(thread->th.th_task_state_memo_stack);
5996  thread->th.th_task_state_memo_stack = NULL;
5997  }
5998 
5999 #if KMP_USE_BGET
6000  if (thread->th.th_local.bget_data != NULL) {
6001  __kmp_finalize_bget(thread);
6002  }
6003 #endif
6004 
6005 #if KMP_AFFINITY_SUPPORTED
6006  if (thread->th.th_affin_mask != NULL) {
6007  KMP_CPU_FREE(thread->th.th_affin_mask);
6008  thread->th.th_affin_mask = NULL;
6009  }
6010 #endif /* KMP_AFFINITY_SUPPORTED */
6011 
6012 #if KMP_USE_HIER_SCHED
6013  if (thread->th.th_hier_bar_data != NULL) {
6014  __kmp_free(thread->th.th_hier_bar_data);
6015  thread->th.th_hier_bar_data = NULL;
6016  }
6017 #endif
6018 
6019  __kmp_reap_team(thread->th.th_serial_team);
6020  thread->th.th_serial_team = NULL;
6021  __kmp_free(thread);
6022 
6023  KMP_MB();
6024 
6025 } // __kmp_reap_thread
6026 
6027 static void __kmp_internal_end(void) {
6028  int i;
6029 
6030  /* First, unregister the library */
6031  __kmp_unregister_library();
6032 
6033 #if KMP_OS_WINDOWS
6034  /* In Win static library, we can't tell when a root actually dies, so we
6035  reclaim the data structures for any root threads that have died but not
6036  unregistered themselves, in order to shut down cleanly.
6037  In Win dynamic library we also can't tell when a thread dies. */
6038  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6039 // dead roots
6040 #endif
6041 
6042  for (i = 0; i < __kmp_threads_capacity; i++)
6043  if (__kmp_root[i])
6044  if (__kmp_root[i]->r.r_active)
6045  break;
6046  KMP_MB(); /* Flush all pending memory write invalidates. */
6047  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6048 
6049  if (i < __kmp_threads_capacity) {
6050 #if KMP_USE_MONITOR
6051  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6052  KMP_MB(); /* Flush all pending memory write invalidates. */
6053 
6054  // Need to check that monitor was initialized before reaping it. If we are
6055  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6056  // __kmp_monitor will appear to contain valid data, but it is only valid in
6057  // the parent process, not the child.
6058  // New behavior (201008): instead of keying off of the flag
6059  // __kmp_init_parallel, the monitor thread creation is keyed off
6060  // of the new flag __kmp_init_monitor.
6061  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6062  if (TCR_4(__kmp_init_monitor)) {
6063  __kmp_reap_monitor(&__kmp_monitor);
6064  TCW_4(__kmp_init_monitor, 0);
6065  }
6066  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6067  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6068 #endif // KMP_USE_MONITOR
6069  } else {
6070 /* TODO move this to cleanup code */
6071 #ifdef KMP_DEBUG
6072  /* make sure that everything has properly ended */
6073  for (i = 0; i < __kmp_threads_capacity; i++) {
6074  if (__kmp_root[i]) {
6075  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6076  // there can be uber threads alive here
6077  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6078  }
6079  }
6080 #endif
6081 
6082  KMP_MB();
6083 
6084  // Reap the worker threads.
6085  // This is valid for now, but be careful if threads are reaped sooner.
6086  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6087  // Get the next thread from the pool.
6088  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6089  __kmp_thread_pool = thread->th.th_next_pool;
6090  // Reap it.
6091  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6092  thread->th.th_next_pool = NULL;
6093  thread->th.th_in_pool = FALSE;
6094  __kmp_reap_thread(thread, 0);
6095  }
6096  __kmp_thread_pool_insert_pt = NULL;
6097 
6098  // Reap teams.
6099  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6100  // Get the next team from the pool.
6101  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6102  __kmp_team_pool = team->t.t_next_pool;
6103  // Reap it.
6104  team->t.t_next_pool = NULL;
6105  __kmp_reap_team(team);
6106  }
6107 
6108  __kmp_reap_task_teams();
6109 
6110 #if KMP_OS_UNIX
6111  // Threads that are not reaped should not access any resources since they
6112  // are going to be deallocated soon, so the shutdown sequence should wait
6113  // until all threads either exit the final spin-waiting loop or begin
6114  // sleeping after the given blocktime.
6115  for (i = 0; i < __kmp_threads_capacity; i++) {
6116  kmp_info_t *thr = __kmp_threads[i];
6117  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6118  KMP_CPU_PAUSE();
6119  }
6120 #endif
6121 
6122  for (i = 0; i < __kmp_threads_capacity; ++i) {
6123  // TBD: Add some checking...
6124  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6125  }
6126 
6127  /* Make sure all threadprivate destructors get run by joining with all
6128  worker threads before resetting this flag */
6129  TCW_SYNC_4(__kmp_init_common, FALSE);
6130 
6131  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6132  KMP_MB();
6133 
6134 #if KMP_USE_MONITOR
6135  // See note above: One of the possible fixes for CQ138434 / CQ140126
6136  //
6137  // FIXME: push both code fragments down and CSE them?
6138  // push them into __kmp_cleanup() ?
6139  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6140  if (TCR_4(__kmp_init_monitor)) {
6141  __kmp_reap_monitor(&__kmp_monitor);
6142  TCW_4(__kmp_init_monitor, 0);
6143  }
6144  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6145  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6146 #endif
6147  } /* else !__kmp_global.t_active */
6148  TCW_4(__kmp_init_gtid, FALSE);
6149  KMP_MB(); /* Flush all pending memory write invalidates. */
6150 
6151  __kmp_cleanup();
6152 #if OMPT_SUPPORT
6153  ompt_fini();
6154 #endif
6155 }
6156 
6157 void __kmp_internal_end_library(int gtid_req) {
6158  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6159  /* this shouldn't be a race condition because __kmp_internal_end() is the
6160  only place to clear __kmp_serial_init */
6161  /* we'll check this later too, after we get the lock */
6162  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6163  // redundant, because the next check will work in any case.
6164  if (__kmp_global.g.g_abort) {
6165  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6166  /* TODO abort? */
6167  return;
6168  }
6169  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6170  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6171  return;
6172  }
6173 
6174  KMP_MB(); /* Flush all pending memory write invalidates. */
6175  /* find out who we are and what we should do */
6176  {
6177  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6178  KA_TRACE(
6179  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6180  if (gtid == KMP_GTID_SHUTDOWN) {
6181  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6182  "already shutdown\n"));
6183  return;
6184  } else if (gtid == KMP_GTID_MONITOR) {
6185  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6186  "registered, or system shutdown\n"));
6187  return;
6188  } else if (gtid == KMP_GTID_DNE) {
6189  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6190  "shutdown\n"));
6191  /* we don't know who we are, but we may still shutdown the library */
6192  } else if (KMP_UBER_GTID(gtid)) {
6193  /* unregister ourselves as an uber thread. gtid is no longer valid */
6194  if (__kmp_root[gtid]->r.r_active) {
6195  __kmp_global.g.g_abort = -1;
6196  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6197  __kmp_unregister_library();
6198  KA_TRACE(10,
6199  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6200  gtid));
6201  return;
6202  } else {
6203  KA_TRACE(
6204  10,
6205  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6206  __kmp_unregister_root_current_thread(gtid);
6207  }
6208  } else {
6209 /* worker threads may call this function through the atexit handler, if they
6210  * call exit() */
6211 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6212  TODO: do a thorough shutdown instead */
6213 #ifdef DUMP_DEBUG_ON_EXIT
6214  if (__kmp_debug_buf)
6215  __kmp_dump_debug_buffer();
6216 #endif
6217  // Unregister the library here as well; with shared-memory registration on
6218  // Linux, skipping this would leave stale files in /dev/shm. Clean up the
6219  // shared memory file before exiting.
6220  __kmp_unregister_library();
6221  return;
6222  }
6223  }
6224  /* synchronize the termination process */
6225  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6226 
6227  /* have we already finished */
6228  if (__kmp_global.g.g_abort) {
6229  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6230  /* TODO abort? */
6231  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6232  return;
6233  }
6234  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6235  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6236  return;
6237  }
6238 
6239  /* We need this lock to enforce mutex between this reading of
6240  __kmp_threads_capacity and the writing by __kmp_register_root.
6241  Alternatively, we can use a counter of roots that is atomically updated by
6242  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6243  __kmp_internal_end_*. */
6244  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6245 
6246  /* now we can safely conduct the actual termination */
6247  __kmp_internal_end();
6248 
6249  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6250  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6251 
6252  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6253 
6254 #ifdef DUMP_DEBUG_ON_EXIT
6255  if (__kmp_debug_buf)
6256  __kmp_dump_debug_buffer();
6257 #endif
6258 
6259 #if KMP_OS_WINDOWS
6260  __kmp_close_console();
6261 #endif
6262 
6263  __kmp_fini_allocator();
6264 
6265 } // __kmp_internal_end_library
6266 
6267 void __kmp_internal_end_thread(int gtid_req) {
6268  int i;
6269 
6270  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6271  /* this shouldn't be a race condition because __kmp_internal_end() is the
6272  * only place to clear __kmp_serial_init */
6273  /* we'll check this later too, after we get the lock */
6274  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6275  // redundant, because the next check will work in any case.
6276  if (__kmp_global.g.g_abort) {
6277  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6278  /* TODO abort? */
6279  return;
6280  }
6281  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6282  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6283  return;
6284  }
6285 
6286  // If hidden helper team has been initialized, we need to deinit it
6287  if (TCR_4(__kmp_init_hidden_helper)) {
6288  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6289  // First release the main thread to let it continue its work
6290  __kmp_hidden_helper_main_thread_release();
6291  // Wait until the hidden helper team has been destroyed
6292  __kmp_hidden_helper_threads_deinitz_wait();
6293  }
6294 
6295  KMP_MB(); /* Flush all pending memory write invalidates. */
6296 
6297  /* find out who we are and what we should do */
6298  {
6299  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6300  KA_TRACE(10,
6301  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6302  if (gtid == KMP_GTID_SHUTDOWN) {
6303  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6304  "already shutdown\n"));
6305  return;
6306  } else if (gtid == KMP_GTID_MONITOR) {
6307  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6308  "registered, or system shutdown\n"));
6309  return;
6310  } else if (gtid == KMP_GTID_DNE) {
6311  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6312  "shutdown\n"));
6313  return;
6314  /* we don't know who we are */
6315  } else if (KMP_UBER_GTID(gtid)) {
6316  /* unregister ourselves as an uber thread. gtid is no longer valid */
6317  if (__kmp_root[gtid]->r.r_active) {
6318  __kmp_global.g.g_abort = -1;
6319  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6320  KA_TRACE(10,
6321  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6322  gtid));
6323  return;
6324  } else {
6325  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6326  gtid));
6327  __kmp_unregister_root_current_thread(gtid);
6328  }
6329  } else {
6330  /* just a worker thread, let's leave */
6331  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6332 
6333  if (gtid >= 0) {
6334  __kmp_threads[gtid]->th.th_task_team = NULL;
6335  }
6336 
6337  KA_TRACE(10,
6338  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6339  gtid));
6340  return;
6341  }
6342  }
6343 #if KMP_DYNAMIC_LIB
6344  if (__kmp_pause_status != kmp_hard_paused)
6345  // AC: let's not shut down the dynamic library at the exit of the uber
6346  // thread; it is better to shut down later, in the library destructor.
6347  {
6348  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6349  return;
6350  }
6351 #endif
6352  /* synchronize the termination process */
6353  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6354 
6355  /* have we already finished */
6356  if (__kmp_global.g.g_abort) {
6357  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6358  /* TODO abort? */
6359  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6360  return;
6361  }
6362  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6363  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6364  return;
6365  }
6366 
6367  /* We need this lock to enforce mutex between this reading of
6368  __kmp_threads_capacity and the writing by __kmp_register_root.
6369  Alternatively, we can use a counter of roots that is atomically updated by
6370  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6371  __kmp_internal_end_*. */
6372 
6373  /* should we finish the run-time? are all siblings done? */
6374  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6375 
6376  for (i = 0; i < __kmp_threads_capacity; ++i) {
6377  if (KMP_UBER_GTID(i)) {
6378  KA_TRACE(
6379  10,
6380  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6381  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6382  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6383  return;
6384  }
6385  }
6386 
6387  /* now we can safely conduct the actual termination */
6388 
6389  __kmp_internal_end();
6390 
6391  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6392  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6393 
6394  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6395 
6396 #ifdef DUMP_DEBUG_ON_EXIT
6397  if (__kmp_debug_buf)
6398  __kmp_dump_debug_buffer();
6399 #endif
6400 } // __kmp_internal_end_thread
6401 
6402 // -----------------------------------------------------------------------------
6403 // Library registration stuff.
6404 
6405 static long __kmp_registration_flag = 0;
6406 // Random value used to indicate library initialization.
6407 static char *__kmp_registration_str = NULL;
6408 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6409 
6410 static inline char *__kmp_reg_status_name() {
6411 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6412  each thread. If registration and unregistration go in different threads
6413  (omp_misc_other_root_exit.cpp test case), the registered_lib_env environment
6414  variable cannot be found, because its name will contain a different pid. */
6415 // macOS* complains that the name would be too long if getuid() were appended
6416 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6417  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6418  (int)getuid());
6419 #else
6420  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6421 #endif
6422 } // __kmp_reg_status_name
6423 
6424 void __kmp_register_library_startup(void) {
6425 
6426  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6427  int done = 0;
6428  union {
6429  double dtime;
6430  long ltime;
6431  } time;
6432 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6433  __kmp_initialize_system_tick();
6434 #endif
6435  __kmp_read_system_time(&time.dtime);
6436  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6437  __kmp_registration_str =
6438  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6439  __kmp_registration_flag, KMP_LIBRARY_FILE);
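  // The registration record has the form "<flag address>-<flag value in
  // hex>-<library file name>"; a later runtime instance parses these fields
  // to decide whether the registered copy is still alive.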
6440 
6441  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6442  __kmp_registration_str));
6443 
6444  while (!done) {
6445 
6446  char *value = NULL; // Actual value of the environment variable.
6447 
6448 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6449  char *shm_name = __kmp_str_format("/%s", name);
6450  int shm_preexist = 0;
6451  char *data1;
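  // O_CREAT | O_EXCL succeeds only for the first instance to register; EEXIST
  // means another instance (alive or dead) already created the segment, so
  // fall back to opening the existing one.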
6452  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6453  if ((fd1 == -1) && (errno == EEXIST)) {
6454  // file didn't open because it already exists.
6455  // try opening existing file
6456  fd1 = shm_open(shm_name, O_RDWR, 0666);
6457  if (fd1 == -1) { // file didn't open
6458  // error out here
6459  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6460  __kmp_msg_null);
6461  } else {
6462  // able to open existing file
6463  shm_preexist = 1;
6464  }
6465  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6466  // "already exists".
6467  // error out here.
6468  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6469  __kmp_msg_null);
6470  }
6471  if (shm_preexist == 0) {
6472  // we created the SHM; now set its size
6473  if (ftruncate(fd1, SHM_SIZE) == -1) {
6474  // error occurred setting size;
6475  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6476  KMP_ERR(errno), __kmp_msg_null);
6477  }
6478  }
6479  data1 =
6480  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6481  if (data1 == MAP_FAILED) {
6482  // failed to map shared memory
6483  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6484  __kmp_msg_null);
6485  }
6486  if (shm_preexist == 0) { // set data to SHM, set value
6487  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6488  }
6489  // Read value from either what we just wrote or existing file.
6490  value = __kmp_str_format("%s", data1); // read value from SHM
6491  munmap(data1, SHM_SIZE);
6492  close(fd1);
6493 #else // Windows and unix with static library
6494  // Set the environment variable, but do not overwrite it if it already exists.
6495  __kmp_env_set(name, __kmp_registration_str, 0);
6496  // read value to see if it got set
6497  value = __kmp_env_get(name);
6498 #endif
6499 
6500  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6501  done = 1; // Ok, environment variable set successfully, exit the loop.
6502  } else {
6503  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6504  // Check whether it is alive or dead.
6505  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
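  // Split the stored record back into its flag-address, flag-value, and
  // file-name fields, then probe the flag address to see whether the other
  // runtime instance is still running.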
6506  char *tail = value;
6507  char *flag_addr_str = NULL;
6508  char *flag_val_str = NULL;
6509  char const *file_name = NULL;
6510  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6511  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6512  file_name = tail;
6513  if (tail != NULL) {
6514  long *flag_addr = 0;
6515  long flag_val = 0;
6516  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6517  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6518  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6519  // First, check whether environment-encoded address is mapped into
6520  // addr space.
6521  // If so, dereference it to see if it still has the right value.
6522  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6523  neighbor = 1;
6524  } else {
6525  // If not, then we know the other copy of the library is no longer
6526  // running.
6527  neighbor = 2;
6528  }
6529  }
6530  }
6531  switch (neighbor) {
6532  case 0: // Cannot parse environment variable -- neighbor status unknown.
6533  // Assume it is an incompatible format from a future version of the
6534  // library. Assume the other library is alive.
6535  // WARN( ... ); // TODO: Issue a warning.
6536  file_name = "unknown library";
6537  KMP_FALLTHROUGH();
6538  // Attention! Falling through to the next case. That's intentional.
6539  case 1: { // Neighbor is alive.
6540  // Check it is allowed.
6541  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6542  if (!__kmp_str_match_true(duplicate_ok)) {
6543  // That's not allowed. Issue fatal error.
6544  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6545  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6546  }
6547  KMP_INTERNAL_FREE(duplicate_ok);
6548  __kmp_duplicate_library_ok = 1;
6549  done = 1; // Exit the loop.
6550  } break;
6551  case 2: { // Neighbor is dead.
6552 
6553 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6554  // close shared memory.
6555  shm_unlink(shm_name); // this removes file in /dev/shm
6556 #else
6557  // Clear the variable and try to register library again.
6558  __kmp_env_unset(name);
6559 #endif
6560  } break;
6561  default: { KMP_DEBUG_ASSERT(0); } break;
6562  }
6563  }
6564  KMP_INTERNAL_FREE((void *)value);
6565 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6566  KMP_INTERNAL_FREE((void *)shm_name);
6567 #endif
6568  } // while
6569  KMP_INTERNAL_FREE((void *)name);
6570 
6571 } // func __kmp_register_library_startup
6572 
6573 void __kmp_unregister_library(void) {
6574 
6575  char *name = __kmp_reg_status_name();
6576  char *value = NULL;
6577 
6578 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6579  char *shm_name = __kmp_str_format("/%s", name);
6580  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6581  if (fd1 == -1) {
6582  // file did not open. return.
6583  return;
6584  }
6585  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6586  if (data1 != MAP_FAILED) {
6587  value = __kmp_str_format("%s", data1); // read value from SHM
6588  munmap(data1, SHM_SIZE);
6589  }
6590  close(fd1);
6591 #else
6592  value = __kmp_env_get(name);
6593 #endif
6594 
6595  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6596  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6597  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6598 // Ok, this is our variable. Delete it.
6599 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6600  shm_unlink(shm_name); // this removes file in /dev/shm
6601 #else
6602  __kmp_env_unset(name);
6603 #endif
6604  }
6605 
6606 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6607  KMP_INTERNAL_FREE(shm_name);
6608 #endif
6609 
6610  KMP_INTERNAL_FREE(__kmp_registration_str);
6611  KMP_INTERNAL_FREE(value);
6612  KMP_INTERNAL_FREE(name);
6613 
6614  __kmp_registration_flag = 0;
6615  __kmp_registration_str = NULL;
6616 
6617 } // __kmp_unregister_library
6618 
6619 // End of Library registration stuff.
6620 // -----------------------------------------------------------------------------
6621 
6622 #if KMP_MIC_SUPPORTED
6623 
6624 static void __kmp_check_mic_type() {
6625  kmp_cpuid_t cpuid_state = {0};
6626  kmp_cpuid_t *cs_p = &cpuid_state;
6627  __kmp_x86_cpuid(1, 0, cs_p);
6628  // We don't support mic1 at the moment
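  // CPUID leaf 1 EAX encodes family/model: family 0x0B corresponds to Knights
  // Corner (mic2); family 6, model 0x57 corresponds to Knights Landing (mic3).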
6629  if ((cs_p->eax & 0xff0) == 0xB10) {
6630  __kmp_mic_type = mic2;
6631  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6632  __kmp_mic_type = mic3;
6633  } else {
6634  __kmp_mic_type = non_mic;
6635  }
6636 }
6637 
6638 #endif /* KMP_MIC_SUPPORTED */
6639 
6640 #if KMP_HAVE_UMWAIT
6641 static void __kmp_user_level_mwait_init() {
6642  struct kmp_cpuid buf;
6643  __kmp_x86_cpuid(7, 0, &buf);
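  // CPUID leaf 7, sub-leaf 0: ECX bit 5 reports WAITPKG (umonitor/umwait)
  // support; it is honored only if the user also requested user-level mwait.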
6644  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6645  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6646  __kmp_umwait_enabled));
6647 }
6648 #elif KMP_HAVE_MWAIT
6649 #ifndef AT_INTELPHIUSERMWAIT
6650 // Spurious, non-existent value that should always fail to return anything.
6651 // Will be replaced with the correct value once it is known.
6652 #define AT_INTELPHIUSERMWAIT 10000
6653 #endif
6654 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
6655 // earlier OS is used to build the RTL, we'll use the following internal
6656 // function when the entry is not found.
6657 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6658 unsigned long getauxval(unsigned long) { return 0; }
6659 
6660 static void __kmp_user_level_mwait_init() {
6661  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
6662  // available, use them to determine whether user-level mwait is enabled.
6663  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
6664  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6665  if (__kmp_mic_type == mic3) {
6666  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6667  if ((res & 0x1) || __kmp_user_level_mwait) {
6668  __kmp_mwait_enabled = TRUE;
6669  if (__kmp_user_level_mwait) {
6670  KMP_INFORM(EnvMwaitWarn);
6671  }
6672  } else {
6673  __kmp_mwait_enabled = FALSE;
6674  }
6675  }
6676  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6677  "__kmp_mwait_enabled = %d\n",
6678  __kmp_mic_type, __kmp_mwait_enabled));
6679 }
6680 #endif /* KMP_HAVE_UMWAIT */
6681 
6682 static void __kmp_do_serial_initialize(void) {
6683  int i, gtid;
6684  size_t size;
6685 
6686  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6687 
6688  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6689  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6690  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6691  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6692  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6693 
6694 #if OMPT_SUPPORT
6695  ompt_pre_init();
6696 #endif
6697 
6698  __kmp_validate_locks();
6699 
6700  /* Initialize internal memory allocator */
6701  __kmp_init_allocator();
6702 
6703  /* Register the library startup via an environment variable and check to see
6704  whether another copy of the library is already registered. */
6705 
6706  __kmp_register_library_startup();
6707 
6708  /* TODO reinitialization of library */
6709  if (TCR_4(__kmp_global.g.g_done)) {
6710  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6711  }
6712 
6713  __kmp_global.g.g_abort = 0;
6714  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6715 
6716 /* initialize the locks */
6717 #if KMP_USE_ADAPTIVE_LOCKS
6718 #if KMP_DEBUG_ADAPTIVE_LOCKS
6719  __kmp_init_speculative_stats();
6720 #endif
6721 #endif
6722 #if KMP_STATS_ENABLED
6723  __kmp_stats_init();
6724 #endif
6725  __kmp_init_lock(&__kmp_global_lock);
6726  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6727  __kmp_init_lock(&__kmp_debug_lock);
6728  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6729  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6730  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6731  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6732  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6733  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6734  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6735  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6736  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6737  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6738  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6739  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6740  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6741  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6742  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6743 #if KMP_USE_MONITOR
6744  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6745 #endif
6746  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6747 
6748  /* conduct initialization and initial setup of configuration */
6749 
6750  __kmp_runtime_initialize();
6751 
6752 #if KMP_MIC_SUPPORTED
6753  __kmp_check_mic_type();
6754 #endif
6755 
6756 // Some global variable initialization moved here from kmp_env_initialize()
6757 #ifdef KMP_DEBUG
6758  kmp_diag = 0;
6759 #endif
6760  __kmp_abort_delay = 0;
6761 
6762  // From __kmp_init_dflt_team_nth()
6763  /* assume the entire machine will be used */
6764  __kmp_dflt_team_nth_ub = __kmp_xproc;
6765  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6766  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6767  }
6768  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6769  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6770  }
6771  __kmp_max_nth = __kmp_sys_max_nth;
6772  __kmp_cg_max_nth = __kmp_sys_max_nth;
6773  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6774  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6775  __kmp_teams_max_nth = __kmp_sys_max_nth;
6776  }
6777 
6778  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6779  // part
6780  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6781 #if KMP_USE_MONITOR
6782  __kmp_monitor_wakeups =
6783  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6784  __kmp_bt_intervals =
6785  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6786 #endif
6787  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6788  __kmp_library = library_throughput;
6789  // From KMP_SCHEDULE initialization
6790  __kmp_static = kmp_sch_static_balanced;
6791 // AC: do not use analytical here, because it is non-monotonic
6792 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6793 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6794 // need to repeat assignment
6795 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6796 // bit control and barrier method control parts
6797 #if KMP_FAST_REDUCTION_BARRIER
6798 #define kmp_reduction_barrier_gather_bb ((int)1)
6799 #define kmp_reduction_barrier_release_bb ((int)1)
6800 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6801 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6802 #endif // KMP_FAST_REDUCTION_BARRIER
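  // Each barrier type gets branch bits (the barrier tree fan-out is
  // 2^branch_bits) and a gather/release pattern; the reduction barrier is
  // tuned separately when KMP_FAST_REDUCTION_BARRIER is enabled.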
6803  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6804  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6805  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6806  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6807  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6808 #if KMP_FAST_REDUCTION_BARRIER
6809  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6810  // lin_64 ): hyper,1
6811  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6812  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6813  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6814  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6815  }
6816 #endif // KMP_FAST_REDUCTION_BARRIER
6817  }
6818 #if KMP_FAST_REDUCTION_BARRIER
6819 #undef kmp_reduction_barrier_release_pat
6820 #undef kmp_reduction_barrier_gather_pat
6821 #undef kmp_reduction_barrier_release_bb
6822 #undef kmp_reduction_barrier_gather_bb
6823 #endif // KMP_FAST_REDUCTION_BARRIER
6824 #if KMP_MIC_SUPPORTED
6825  if (__kmp_mic_type == mic2) { // KNC
6826  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6827  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6828  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6829  1; // forkjoin release
6830  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6831  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6832  }
6833 #if KMP_FAST_REDUCTION_BARRIER
6834  if (__kmp_mic_type == mic2) { // KNC
6835  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6836  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6837  }
6838 #endif // KMP_FAST_REDUCTION_BARRIER
6839 #endif // KMP_MIC_SUPPORTED
6840 
6841 // From KMP_CHECKS initialization
6842 #ifdef KMP_DEBUG
6843  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6844 #else
6845  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6846 #endif
6847 
6848  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6849  __kmp_foreign_tp = TRUE;
6850 
6851  __kmp_global.g.g_dynamic = FALSE;
6852  __kmp_global.g.g_dynamic_mode = dynamic_default;
6853 
6854  __kmp_env_initialize(NULL);
6855 
6856 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6857  __kmp_user_level_mwait_init();
6858 #endif
6859 // Print all messages in message catalog for testing purposes.
6860 #ifdef KMP_DEBUG
6861  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6862  if (__kmp_str_match_true(val)) {
6863  kmp_str_buf_t buffer;
6864  __kmp_str_buf_init(&buffer);
6865  __kmp_i18n_dump_catalog(&buffer);
6866  __kmp_printf("%s", buffer.str);
6867  __kmp_str_buf_free(&buffer);
6868  }
6869  __kmp_env_free(&val);
6870 #endif
6871 
6872  __kmp_threads_capacity =
6873  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6874  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6875  __kmp_tp_capacity = __kmp_default_tp_capacity(
6876  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6877 
6878  // If the library is shut down properly, both pools must be NULL. Just in
6879  // case, set them to NULL -- some memory may leak, but subsequent code will
6880  // work even if pools are not freed.
6881  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6882  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6883  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6884  __kmp_thread_pool = NULL;
6885  __kmp_thread_pool_insert_pt = NULL;
6886  __kmp_team_pool = NULL;
6887 
6888  /* Allocate all of the variable sized records */
6889  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6890  * expandable */
6891  /* Since allocation is cache-aligned, just add extra padding at the end */
6892  size =
6893  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6894  CACHE_LINE;
6895  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6896  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6897  sizeof(kmp_info_t *) * __kmp_threads_capacity);
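  // __kmp_root shares the single cache-aligned allocation with __kmp_threads
  // and starts immediately after the thread pointer array.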
6898 
6899  /* init thread counts */
6900  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6901  0); // Asserts fail if the library is reinitializing and
6902  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6903  __kmp_all_nth = 0;
6904  __kmp_nth = 0;
6905 
6906  /* setup the uber master thread and hierarchy */
6907  gtid = __kmp_register_root(TRUE);
6908  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6909  KMP_ASSERT(KMP_UBER_GTID(gtid));
6910  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6911 
6912  KMP_MB(); /* Flush all pending memory write invalidates. */
6913 
6914  __kmp_common_initialize();
6915 
6916 #if KMP_OS_UNIX
6917  /* invoke the child fork handler */
6918  __kmp_register_atfork();
6919 #endif
6920 
6921 #if !KMP_DYNAMIC_LIB
6922  {
6923  /* Invoke the exit handler when the program finishes, only for static
6924  library. For dynamic library, we already have _fini and DllMain. */
6925  int rc = atexit(__kmp_internal_end_atexit);
6926  if (rc != 0) {
6927  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6928  __kmp_msg_null);
6929  }
6930  }
6931 #endif
6932 
6933 #if KMP_HANDLE_SIGNALS
6934 #if KMP_OS_UNIX
6935  /* NOTE: make sure that this is called before the user installs their own
6936  signal handlers so that the user handlers are called first. This way they
6937  can return false, not call our handler, avoid terminating the library, and
6938  continue execution where they left off. */
6939  __kmp_install_signals(FALSE);
6940 #endif /* KMP_OS_UNIX */
6941 #if KMP_OS_WINDOWS
6942  __kmp_install_signals(TRUE);
6943 #endif /* KMP_OS_WINDOWS */
6944 #endif
6945 
6946  /* we have finished the serial initialization */
6947  __kmp_init_counter++;
6948 
6949  __kmp_init_serial = TRUE;
6950 
6951  if (__kmp_settings) {
6952  __kmp_env_print();
6953  }
6954 
6955  if (__kmp_display_env || __kmp_display_env_verbose) {
6956  __kmp_env_print_2();
6957  }
6958 
6959 #if OMPT_SUPPORT
6960  ompt_post_init();
6961 #endif
6962 
6963  KMP_MB();
6964 
6965  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6966 }
6967 
6968 void __kmp_serial_initialize(void) {
6969  if (__kmp_init_serial) {
6970  return;
6971  }
6972  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6973  if (__kmp_init_serial) {
6974  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6975  return;
6976  }
6977  __kmp_do_serial_initialize();
6978  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6979 }
6980 
6981 static void __kmp_do_middle_initialize(void) {
6982  int i, j;
6983  int prev_dflt_team_nth;
6984 
6985  if (!__kmp_init_serial) {
6986  __kmp_do_serial_initialize();
6987  }
6988 
6989  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6990 
6991  // Save the previous value for the __kmp_dflt_team_nth so that
6992  // we can avoid some reinitialization if it hasn't changed.
6993  prev_dflt_team_nth = __kmp_dflt_team_nth;
6994 
6995 #if KMP_AFFINITY_SUPPORTED
6996  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6997  // number of cores on the machine.
6998  __kmp_affinity_initialize();
6999 
7000  // Run through the __kmp_threads array and set the affinity mask
7001  // for each root thread that is currently registered with the RTL.
7002  for (i = 0; i < __kmp_threads_capacity; i++) {
7003  if (TCR_PTR(__kmp_threads[i]) != NULL) {
7004  __kmp_affinity_set_init_mask(i, TRUE);
7005  }
7006  }
7007 #endif /* KMP_AFFINITY_SUPPORTED */
7008 
7009  KMP_ASSERT(__kmp_xproc > 0);
7010  if (__kmp_avail_proc == 0) {
7011  __kmp_avail_proc = __kmp_xproc;
7012  }
7013 
7014  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7015  // correct them now
7016  j = 0;
7017  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7018  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7019  __kmp_avail_proc;
7020  j++;
7021  }
7022 
7023  if (__kmp_dflt_team_nth == 0) {
7024 #ifdef KMP_DFLT_NTH_CORES
7025  // Default #threads = #cores
7026  __kmp_dflt_team_nth = __kmp_ncores;
7027  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7028  "__kmp_ncores (%d)\n",
7029  __kmp_dflt_team_nth));
7030 #else
7031  // Default #threads = #available OS procs
7032  __kmp_dflt_team_nth = __kmp_avail_proc;
7033  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7034  "__kmp_avail_proc(%d)\n",
7035  __kmp_dflt_team_nth));
7036 #endif /* KMP_DFLT_NTH_CORES */
7037  }
7038 
7039  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7040  __kmp_dflt_team_nth = KMP_MIN_NTH;
7041  }
7042  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7043  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7044  }
7045 
7046  // There's no harm in continuing if the following check fails,
7047  // but it indicates an error in the previous logic.
7048  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7049 
7050  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7051  // Run through the __kmp_threads array and set the num threads icv for each
7052  // root thread that is currently registered with the RTL (which has not
7053  // already explicitly set its nthreads-var with a call to
7054  // omp_set_num_threads()).
7055  for (i = 0; i < __kmp_threads_capacity; i++) {
7056  kmp_info_t *thread = __kmp_threads[i];
7057  if (thread == NULL)
7058  continue;
7059  if (thread->th.th_current_task->td_icvs.nproc != 0)
7060  continue;
7061 
7062  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7063  }
7064  }
7065  KA_TRACE(
7066  20,
7067  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7068  __kmp_dflt_team_nth));
7069 
7070 #ifdef KMP_ADJUST_BLOCKTIME
7071  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7072  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7073  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7074  if (__kmp_nth > __kmp_avail_proc) {
7075  __kmp_zero_bt = TRUE;
7076  }
7077  }
7078 #endif /* KMP_ADJUST_BLOCKTIME */
7079 
7080  /* we have finished middle initialization */
7081  TCW_SYNC_4(__kmp_init_middle, TRUE);
7082 
7083  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7084 }
7085 
7086 void __kmp_middle_initialize(void) {
7087  if (__kmp_init_middle) {
7088  return;
7089  }
7090  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7091  if (__kmp_init_middle) {
7092  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7093  return;
7094  }
7095  __kmp_do_middle_initialize();
7096  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7097 }
7098 
7099 void __kmp_parallel_initialize(void) {
7100  int gtid = __kmp_entry_gtid(); // this might be a new root
7101 
7102  /* synchronize parallel initialization (for sibling) */
7103  if (TCR_4(__kmp_init_parallel))
7104  return;
7105  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7106  if (TCR_4(__kmp_init_parallel)) {
7107  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7108  return;
7109  }
7110 
7111  /* TODO reinitialization after we have already shut down */
7112  if (TCR_4(__kmp_global.g.g_done)) {
7113  KA_TRACE(
7114  10,
7115  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7116  __kmp_infinite_loop();
7117  }
7118 
7119  /* jc: The lock __kmp_initz_lock is already held, so calling
7120  __kmp_serial_initialize would cause a deadlock. So we call
7121  __kmp_do_serial_initialize directly. */
7122  if (!__kmp_init_middle) {
7123  __kmp_do_middle_initialize();
7124  }
7125  __kmp_resume_if_hard_paused();
7126 
7127  /* begin initialization */
7128  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7129  KMP_ASSERT(KMP_UBER_GTID(gtid));
7130 
7131 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7132  // Save the FP control regs.
7133  // Worker threads will set theirs to these values at thread startup.
7134  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7135  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7136  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7137 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7138 
7139 #if KMP_OS_UNIX
7140 #if KMP_HANDLE_SIGNALS
7141  /* must be after __kmp_serial_initialize */
7142  __kmp_install_signals(TRUE);
7143 #endif
7144 #endif
7145 
7146  __kmp_suspend_initialize();
7147 
7148 #if defined(USE_LOAD_BALANCE)
7149  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7150  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7151  }
7152 #else
7153  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7154  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7155  }
7156 #endif
7157 
7158  if (__kmp_version) {
7159  __kmp_print_version_2();
7160  }
7161 
7162  /* we have finished parallel initialization */
7163  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7164 
7165  KMP_MB();
7166  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7167 
7168  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7169 }
7170 
7171 void __kmp_hidden_helper_initialize() {
7172  if (TCR_4(__kmp_init_hidden_helper))
7173  return;
7174 
7175  // __kmp_parallel_initialize is required before we initialize hidden helper
7176  if (!TCR_4(__kmp_init_parallel))
7177  __kmp_parallel_initialize();
7178 
7179  // Double check. Note that this double check should not be placed before
7180  // __kmp_parallel_initialize, as that would cause a deadlock.
7181  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7182  if (TCR_4(__kmp_init_hidden_helper)) {
7183  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7184  return;
7185  }
7186 
7187  // Set the count of hidden helper tasks to be executed to zero
7188  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7189 
7190  // Set the global variable indicating that we're initializing hidden helper
7191  // team/threads
7192  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7193 
7194  // Platform independent initialization
7195  __kmp_do_initialize_hidden_helper_threads();
7196 
7197  // Wait here for the finish of initialization of hidden helper teams
7198  __kmp_hidden_helper_threads_initz_wait();
7199 
7200  // We have finished hidden helper initialization
7201  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7202 
7203  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7204 }
7205 
7206 /* ------------------------------------------------------------------------ */
7207 
7208 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7209  kmp_team_t *team) {
7210  kmp_disp_t *dispatch;
7211 
7212  KMP_MB();
7213 
7214  /* none of the threads have encountered any constructs, yet. */
7215  this_thr->th.th_local.this_construct = 0;
7216 #if KMP_CACHE_MANAGE
7217  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7218 #endif /* KMP_CACHE_MANAGE */
7219  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7220  KMP_DEBUG_ASSERT(dispatch);
7221  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7222  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7223  // this_thr->th.th_info.ds.ds_tid ] );
7224 
7225  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7226  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7227  if (__kmp_env_consistency_check)
7228  __kmp_push_parallel(gtid, team->t.t_ident);
7229 
7230  KMP_MB(); /* Flush all pending memory write invalidates. */
7231 }
7232 
7233 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7234  kmp_team_t *team) {
7235  if (__kmp_env_consistency_check)
7236  __kmp_pop_parallel(gtid, team->t.t_ident);
7237 
7238  __kmp_finish_implicit_task(this_thr);
7239 }
7240 
7241 int __kmp_invoke_task_func(int gtid) {
7242  int rc;
7243  int tid = __kmp_tid_from_gtid(gtid);
7244  kmp_info_t *this_thr = __kmp_threads[gtid];
7245  kmp_team_t *team = this_thr->th.th_team;
7246 
7247  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7248 #if USE_ITT_BUILD
7249  if (__itt_stack_caller_create_ptr) {
7250  __kmp_itt_stack_callee_enter(
7251  (__itt_caller)
7252  team->t.t_stack_id); // inform ittnotify about entering user's code
7253  }
7254 #endif /* USE_ITT_BUILD */
7255 #if INCLUDE_SSC_MARKS
7256  SSC_MARK_INVOKING();
7257 #endif
7258 
7259 #if OMPT_SUPPORT
7260  void *dummy;
7261  void **exit_frame_p;
7262  ompt_data_t *my_task_data;
7263  ompt_data_t *my_parallel_data;
7264  int ompt_team_size;
7265 
7266  if (ompt_enabled.enabled) {
7267  exit_frame_p = &(
7268  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7269  } else {
7270  exit_frame_p = &dummy;
7271  }
7272 
7273  my_task_data =
7274  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7275  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7276  if (ompt_enabled.ompt_callback_implicit_task) {
7277  ompt_team_size = team->t.t_nproc;
7278  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7279  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7280  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7281  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7282  }
7283 #endif
7284 
7285 #if KMP_STATS_ENABLED
7286  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7287  if (previous_state == stats_state_e::TEAMS_REGION) {
7288  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7289  } else {
7290  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7291  }
7292  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7293 #endif
7294 
7295  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7296  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7297 #if OMPT_SUPPORT
7298  ,
7299  exit_frame_p
7300 #endif
7301  );
7302 #if OMPT_SUPPORT
7303  *exit_frame_p = NULL;
7304  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7305 #endif
7306 
7307 #if KMP_STATS_ENABLED
7308  if (previous_state == stats_state_e::TEAMS_REGION) {
7309  KMP_SET_THREAD_STATE(previous_state);
7310  }
7311  KMP_POP_PARTITIONED_TIMER();
7312 #endif
7313 
7314 #if USE_ITT_BUILD
7315  if (__itt_stack_caller_create_ptr) {
7316  __kmp_itt_stack_callee_leave(
7317  (__itt_caller)
7318  team->t.t_stack_id); // inform ittnotify about leaving user's code
7319  }
7320 #endif /* USE_ITT_BUILD */
7321  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7322 
7323  return rc;
7324 }
7325 
7326 void __kmp_teams_master(int gtid) {
7327  // This routine is called by all master threads in the teams construct
7328  kmp_info_t *thr = __kmp_threads[gtid];
7329  kmp_team_t *team = thr->th.th_team;
7330  ident_t *loc = team->t.t_ident;
7331  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7332  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7333  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7334  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7335  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7336 
7337  // This thread is a new CG root. Set up the proper variables.
7338  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7339  tmp->cg_root = thr; // Make thr the CG root
7340  // Init to thread limit that was stored when league masters were forked
7341  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7342  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7343  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7344  " cg_nthreads to 1\n",
7345  thr, tmp));
7346  tmp->up = thr->th.th_cg_roots;
7347  thr->th.th_cg_roots = tmp;
7348 
7349 // Launch the league of teams now, but do not let workers execute
7350 // (they wait on the fork barrier until the next parallel region)
7351 #if INCLUDE_SSC_MARKS
7352  SSC_MARK_FORKING();
7353 #endif
7354  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7355  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7356  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7357 #if INCLUDE_SSC_MARKS
7358  SSC_MARK_JOINING();
7359 #endif
7360  // If the team size was reduced from the limit, set it to the new size
7361  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7362  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7363  // AC: the last parameter "1" eliminates the join barrier, which won't work
7364  // because worker threads are in a fork barrier waiting for more parallel regions
7365  __kmp_join_call(loc, gtid
7366 #if OMPT_SUPPORT
7367  ,
7368  fork_context_intel
7369 #endif
7370  ,
7371  1);
7372 }
7373 
7374 int __kmp_invoke_teams_master(int gtid) {
7375  kmp_info_t *this_thr = __kmp_threads[gtid];
7376  kmp_team_t *team = this_thr->th.th_team;
7377 #if KMP_DEBUG
7378  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7379  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7380  (void *)__kmp_teams_master);
7381 #endif
7382  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7383 #if OMPT_SUPPORT
7384  int tid = __kmp_tid_from_gtid(gtid);
7385  ompt_data_t *task_data =
7386  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7387  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7388  if (ompt_enabled.ompt_callback_implicit_task) {
7389  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7390  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7391  ompt_task_initial);
7392  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7393  }
7394 #endif
7395  __kmp_teams_master(gtid);
7396 #if OMPT_SUPPORT
7397  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7398 #endif
7399  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7400  return 1;
7401 }
7402 
7403 /* This sets the requested number of threads for the next parallel region
7404  encountered by this team. Since this should be enclosed in the forkjoin
7405  critical section, it should avoid race conditions with asymmetrical nested
7406  parallelism. */
7407 
7408 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7409  kmp_info_t *thr = __kmp_threads[gtid];
7410 
7411  if (num_threads > 0)
7412  thr->th.th_set_nproc = num_threads;
7413 }
7414 
7415 /* this sets the requested number of teams for the teams region and/or
7416  the number of threads for the next parallel region encountered */
7417 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7418  int num_threads) {
7419  kmp_info_t *thr = __kmp_threads[gtid];
7420  KMP_DEBUG_ASSERT(num_teams >= 0);
7421  KMP_DEBUG_ASSERT(num_threads >= 0);
7422 
7423  if (num_teams == 0)
7424  num_teams = 1; // default number of teams is 1.
7425  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7426  if (!__kmp_reserve_warn) {
7427  __kmp_reserve_warn = 1;
7428  __kmp_msg(kmp_ms_warning,
7429  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7430  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7431  }
7432  num_teams = __kmp_teams_max_nth;
7433  }
7434  // Set number of teams (number of threads in the outer "parallel" of the
7435  // teams)
7436  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7437 
7438  // Remember the number of threads for inner parallel regions
7439  if (!TCR_4(__kmp_init_middle))
7440  __kmp_middle_initialize(); // get internal globals calculated
7441  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7442  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7443  if (num_threads == 0) {
7444  num_threads = __kmp_avail_proc / num_teams;
7445  // adjust num_threads w/o warning as it is not user setting
7446  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7447  // no thread_limit clause specified - do not change thread-limit-var ICV
7448  if (num_threads > __kmp_dflt_team_nth) {
7449  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7450  }
7451  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7452  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7453  } // prevent the team size from exceeding thread-limit-var
7454  if (num_teams * num_threads > __kmp_teams_max_nth) {
7455  num_threads = __kmp_teams_max_nth / num_teams;
7456  }
7457  } else {
7458  // This thread will be the master of the league masters
7459  // Store new thread limit; old limit is saved in th_cg_roots list
7460  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7461  // num_threads = min(num_threads, nthreads-var)
7462  if (num_threads > __kmp_dflt_team_nth) {
7463  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7464  }
7465  if (num_teams * num_threads > __kmp_teams_max_nth) {
7466  int new_threads = __kmp_teams_max_nth / num_teams;
7467  if (!__kmp_reserve_warn) { // user asked for too many threads
7468  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7469  __kmp_msg(kmp_ms_warning,
7470  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7471  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7472  }
7473  num_threads = new_threads;
7474  }
7475  }
7476  thr->th.th_teams_size.nth = num_threads;
7477 }
7478 
7479 // Set the proc_bind var to use in the following parallel region.
7480 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7481  kmp_info_t *thr = __kmp_threads[gtid];
7482  thr->th.th_set_proc_bind = proc_bind;
7483 }
7484 
7485 /* Launch the worker threads into the microtask. */
7486 
7487 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7488  kmp_info_t *this_thr = __kmp_threads[gtid];
7489 
7490 #ifdef KMP_DEBUG
7491  int f;
7492 #endif /* KMP_DEBUG */
7493 
7494  KMP_DEBUG_ASSERT(team);
7495  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7496  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7497  KMP_MB(); /* Flush all pending memory write invalidates. */
7498 
7499  team->t.t_construct = 0; /* no single directives seen yet */
7500  team->t.t_ordered.dt.t_value =
7501  0; /* thread 0 enters the ordered section first */
7502 
7503  /* Reset the identifiers on the dispatch buffer */
7504  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7505  if (team->t.t_max_nproc > 1) {
7506  int i;
7507  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7508  team->t.t_disp_buffer[i].buffer_index = i;
7509  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7510  }
7511  } else {
7512  team->t.t_disp_buffer[0].buffer_index = 0;
7513  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7514  }
7515 
7516  KMP_MB(); /* Flush all pending memory write invalidates. */
7517  KMP_ASSERT(this_thr->th.th_team == team);
7518 
7519 #ifdef KMP_DEBUG
7520  for (f = 0; f < team->t.t_nproc; f++) {
7521  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7522  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7523  }
7524 #endif /* KMP_DEBUG */
7525 
7526  /* release the worker threads so they may begin working */
7527  __kmp_fork_barrier(gtid, 0);
7528 }
7529 
7530 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7531  kmp_info_t *this_thr = __kmp_threads[gtid];
7532 
7533  KMP_DEBUG_ASSERT(team);
7534  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7535  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7536  KMP_MB(); /* Flush all pending memory write invalidates. */
7537 
7538 /* Join barrier after fork */
7539 
7540 #ifdef KMP_DEBUG
7541  if (__kmp_threads[gtid] &&
7542  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7543  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7544  __kmp_threads[gtid]);
7545  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7546  "team->t.t_nproc=%d\n",
7547  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7548  team->t.t_nproc);
7549  __kmp_print_structure();
7550  }
7551  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7552  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7553 #endif /* KMP_DEBUG */
7554 
7555  __kmp_join_barrier(gtid); /* wait for everyone */
7556 #if OMPT_SUPPORT
7557  if (ompt_enabled.enabled &&
7558  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7559  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7560  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7561  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7562 #if OMPT_OPTIONAL
7563  void *codeptr = NULL;
7564  if (KMP_MASTER_TID(ds_tid) &&
7565  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7566  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7567  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7568 
7569  if (ompt_enabled.ompt_callback_sync_region_wait) {
7570  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7571  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7572  codeptr);
7573  }
7574  if (ompt_enabled.ompt_callback_sync_region) {
7575  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7576  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7577  codeptr);
7578  }
7579 #endif
7580  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7581  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7582  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7583  }
7584  }
7585 #endif
7586 
7587  KMP_MB(); /* Flush all pending memory write invalidates. */
7588  KMP_ASSERT(this_thr->th.th_team == team);
7589 }
7590 
7591 /* ------------------------------------------------------------------------ */
7592 
7593 #ifdef USE_LOAD_BALANCE
7594 
7595 // Return the number of worker threads actively spinning in the hot team, if we
7596 // are at the outermost level of parallelism. Otherwise, return 0.
7597 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7598  int i;
7599  int retval;
7600  kmp_team_t *hot_team;
7601 
7602  if (root->r.r_active) {
7603  return 0;
7604  }
7605  hot_team = root->r.r_hot_team;
7606  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7607  return hot_team->t.t_nproc - 1; // Don't count master thread
7608  }
7609 
7610  // Skip the master thread - it is accounted for elsewhere.
7611  retval = 0;
7612  for (i = 1; i < hot_team->t.t_nproc; i++) {
7613  if (hot_team->t.t_threads[i]->th.th_active) {
7614  retval++;
7615  }
7616  }
7617  return retval;
7618 }
7619 
7620 // Perform an automatic adjustment to the number of
7621 // threads used by the next parallel region.
7622 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7623  int retval;
7624  int pool_active;
7625  int hot_team_active;
7626  int team_curr_active;
7627  int system_active;
7628 
7629  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7630  set_nproc));
7631  KMP_DEBUG_ASSERT(root);
7632  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7633  ->th.th_current_task->td_icvs.dynamic == TRUE);
7634  KMP_DEBUG_ASSERT(set_nproc > 1);
7635 
7636  if (set_nproc == 1) {
7637  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7638  return 1;
7639  }
7640 
7641  // Threads that are active in the thread pool, active in the hot team for
7642  // this particular root (if we are at the outermost parallel level), and the
7643  // currently executing thread (which becomes the master) can be added to the
7644  // new team, but they already contribute to the system load and must be
7645  // accounted for.
7646  pool_active = __kmp_thread_pool_active_nth;
7647  hot_team_active = __kmp_active_hot_team_nproc(root);
7648  team_curr_active = pool_active + hot_team_active + 1;
7649 
7650  // Check the system load.
7651  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7652  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7653  "hot team active = %d\n",
7654  system_active, pool_active, hot_team_active));
7655 
7656  if (system_active < 0) {
7657  // There was an error reading the necessary info from /proc, so use the
7658  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7659  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7660  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7661  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7662 
7663  // Make this call behave like the thread limit algorithm.
7664  retval = __kmp_avail_proc - __kmp_nth +
7665  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7666  if (retval > set_nproc) {
7667  retval = set_nproc;
7668  }
7669  if (retval < KMP_MIN_NTH) {
7670  retval = KMP_MIN_NTH;
7671  }
7672 
7673  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7674  retval));
7675  return retval;
7676  }
7677 
7678  // There is a slight delay in the load balance algorithm in detecting newly
7679  // running procs. The real system load at this instant should be at least as
7680  // large as the number of active OpenMP threads available to add to the team.
7681  if (system_active < team_curr_active) {
7682  system_active = team_curr_active;
7683  }
7684  retval = __kmp_avail_proc - system_active + team_curr_active;
7685  if (retval > set_nproc) {
7686  retval = set_nproc;
7687  }
7688  if (retval < KMP_MIN_NTH) {
7689  retval = KMP_MIN_NTH;
7690  }
7691 
7692  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7693  return retval;
7694 } // __kmp_load_balance_nproc()
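
// Illustrative worked example (added for clarity; the numbers are hypothetical,
// not from the original source): with __kmp_avail_proc = 8, one idle thread in
// the pool (pool_active = 1), two spinning hot-team workers (hot_team_active = 2)
// and the forking thread itself, team_curr_active = 1 + 2 + 1 = 4. If /proc
// reports system_active = 10, the estimate is retval = 8 - 10 + 4 = 2, which is
// then clamped to the [KMP_MIN_NTH, set_nproc] range before being returned.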
7695 
7696 #endif /* USE_LOAD_BALANCE */
7697 
7698 /* ------------------------------------------------------------------------ */
7699 
7700 /* NOTE: this is called with the __kmp_init_lock held */
7701 void __kmp_cleanup(void) {
7702  int f;
7703 
7704  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7705 
7706  if (TCR_4(__kmp_init_parallel)) {
7707 #if KMP_HANDLE_SIGNALS
7708  __kmp_remove_signals();
7709 #endif
7710  TCW_4(__kmp_init_parallel, FALSE);
7711  }
7712 
7713  if (TCR_4(__kmp_init_middle)) {
7714 #if KMP_AFFINITY_SUPPORTED
7715  __kmp_affinity_uninitialize();
7716 #endif /* KMP_AFFINITY_SUPPORTED */
7717  __kmp_cleanup_hierarchy();
7718  TCW_4(__kmp_init_middle, FALSE);
7719  }
7720 
7721  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7722 
7723  if (__kmp_init_serial) {
7724  __kmp_runtime_destroy();
7725  __kmp_init_serial = FALSE;
7726  }
7727 
7728  __kmp_cleanup_threadprivate_caches();
7729 
7730  for (f = 0; f < __kmp_threads_capacity; f++) {
7731  if (__kmp_root[f] != NULL) {
7732  __kmp_free(__kmp_root[f]);
7733  __kmp_root[f] = NULL;
7734  }
7735  }
7736  __kmp_free(__kmp_threads);
7737  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7738  // there is no need to free __kmp_root separately.
7739  __kmp_threads = NULL;
7740  __kmp_root = NULL;
7741  __kmp_threads_capacity = 0;
7742 
7743 #if KMP_USE_DYNAMIC_LOCK
7744  __kmp_cleanup_indirect_user_locks();
7745 #else
7746  __kmp_cleanup_user_locks();
7747 #endif
7748 
7749 #if KMP_AFFINITY_SUPPORTED
7750  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7751  __kmp_cpuinfo_file = NULL;
7752 #endif /* KMP_AFFINITY_SUPPORTED */
7753 
7754 #if KMP_USE_ADAPTIVE_LOCKS
7755 #if KMP_DEBUG_ADAPTIVE_LOCKS
7756  __kmp_print_speculative_stats();
7757 #endif
7758 #endif
7759  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7760  __kmp_nested_nth.nth = NULL;
7761  __kmp_nested_nth.size = 0;
7762  __kmp_nested_nth.used = 0;
7763  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7764  __kmp_nested_proc_bind.bind_types = NULL;
7765  __kmp_nested_proc_bind.size = 0;
7766  __kmp_nested_proc_bind.used = 0;
7767  if (__kmp_affinity_format) {
7768  KMP_INTERNAL_FREE(__kmp_affinity_format);
7769  __kmp_affinity_format = NULL;
7770  }
7771 
7772  __kmp_i18n_catclose();
7773 
7774 #if KMP_USE_HIER_SCHED
7775  __kmp_hier_scheds.deallocate();
7776 #endif
7777 
7778 #if KMP_STATS_ENABLED
7779  __kmp_stats_fini();
7780 #endif
7781 
7782  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7783 }
7784 
7785 /* ------------------------------------------------------------------------ */
7786 
7787 int __kmp_ignore_mppbeg(void) {
7788  char *env;
7789 
7790  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7791  if (__kmp_str_match_false(env))
7792  return FALSE;
7793  }
7794  // By default __kmpc_begin() is a no-op.
7795  return TRUE;
7796 }
7797 
7798 int __kmp_ignore_mppend(void) {
7799  char *env;
7800 
7801  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7802  if (__kmp_str_match_false(env))
7803  return FALSE;
7804  }
7805  // By default __kmpc_end() is a no-op.
7806  return TRUE;
7807 }
7808 
7809 void __kmp_internal_begin(void) {
7810  int gtid;
7811  kmp_root_t *root;
7812 
7813  /* this is a very important step as it will register new sibling threads
7814  and assign these new uber threads a new gtid */
7815  gtid = __kmp_entry_gtid();
7816  root = __kmp_threads[gtid]->th.th_root;
7817  KMP_ASSERT(KMP_UBER_GTID(gtid));
7818 
7819  if (root->r.r_begin)
7820  return;
7821  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7822  if (root->r.r_begin) {
7823  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7824  return;
7825  }
7826 
7827  root->r.r_begin = TRUE;
7828 
7829  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7830 }
7831 
7832 /* ------------------------------------------------------------------------ */
7833 
7834 void __kmp_user_set_library(enum library_type arg) {
7835  int gtid;
7836  kmp_root_t *root;
7837  kmp_info_t *thread;
7838 
7839  /* first, make sure we are initialized so we can get our gtid */
7840 
7841  gtid = __kmp_entry_gtid();
7842  thread = __kmp_threads[gtid];
7843 
7844  root = thread->th.th_root;
7845 
7846  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7847  library_serial));
7848  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7849  thread */
7850  KMP_WARNING(SetLibraryIncorrectCall);
7851  return;
7852  }
7853 
7854  switch (arg) {
7855  case library_serial:
7856  thread->th.th_set_nproc = 0;
7857  set__nproc(thread, 1);
7858  break;
7859  case library_turnaround:
7860  thread->th.th_set_nproc = 0;
7861  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7862  : __kmp_dflt_team_nth_ub);
7863  break;
7864  case library_throughput:
7865  thread->th.th_set_nproc = 0;
7866  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7867  : __kmp_dflt_team_nth_ub);
7868  break;
7869  default:
7870  KMP_FATAL(UnknownLibraryType, arg);
7871  }
7872 
7873  __kmp_aux_set_library(arg);
7874 }
7875 
7876 void __kmp_aux_set_stacksize(size_t arg) {
7877  if (!__kmp_init_serial)
7878  __kmp_serial_initialize();
7879 
7880 #if KMP_OS_DARWIN
7881  if (arg & (0x1000 - 1)) {
7882  arg &= ~(0x1000 - 1);
7883  if (arg + 0x1000) /* check for overflow if we round up */
7884  arg += 0x1000;
7885  }
7886 #endif
7887  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7888 
7889  /* only change the default stacksize before the first parallel region */
7890  if (!TCR_4(__kmp_init_parallel)) {
7891  size_t value = arg; /* argument is in bytes */
7892 
7893  if (value < __kmp_sys_min_stksize)
7894  value = __kmp_sys_min_stksize;
7895  else if (value > KMP_MAX_STKSIZE)
7896  value = KMP_MAX_STKSIZE;
7897 
7898  __kmp_stksize = value;
7899 
7900  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7901  }
7902 
7903  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7904 }
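
// Usage sketch (illustrative, not part of the original source): in this runtime
// the kmp_set_stacksize_s() extension declared in omp.h typically reaches
// __kmp_aux_set_stacksize(), and it only has an effect before the first
// parallel region, as enforced above.
//
//   #include <omp.h>
//   int main() {
//     kmp_set_stacksize_s(4 * 1024 * 1024); // request 4 MiB worker stacks
//     #pragma omp parallel
//     { /* workers created here use the requested stack size */ }
//     return 0;
//   }
//
// The same value can be supplied without code changes through the KMP_STACKSIZE
// (or standard OMP_STACKSIZE) environment variable.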
7905 
7906 /* set the behaviour of the runtime library */
7907 /* TODO this can cause some odd behaviour with sibling parallelism... */
7908 void __kmp_aux_set_library(enum library_type arg) {
7909  __kmp_library = arg;
7910 
7911  switch (__kmp_library) {
7912  case library_serial: {
7913  KMP_INFORM(LibraryIsSerial);
7914  } break;
7915  case library_turnaround:
7916  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7917  __kmp_use_yield = 2; // only yield when oversubscribed
7918  break;
7919  case library_throughput:
7920  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7921  __kmp_dflt_blocktime = 200;
7922  break;
7923  default:
7924  KMP_FATAL(UnknownLibraryType, arg);
7925  }
7926 }
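
// Usage sketch (illustrative, not part of the original source): the library
// mode is normally chosen through the KMP_LIBRARY environment variable
// (serial, turnaround, or throughput) or the kmp_set_library* extension entry
// points, which are expected to end up in the two routines above:
//
//   #include <omp.h>
//   int main() {
//     kmp_set_library_throughput(); // let idle workers sleep after blocktime
//     #pragma omp parallel
//     { /* ... */ }
//     return 0;
//   }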
7927 
7928 /* Get team information common to all teams API routines */
7929 // Returns NULL if not in a teams construct
7930 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7931  kmp_info_t *thr = __kmp_entry_thread();
7932  teams_serialized = 0;
7933  if (thr->th.th_teams_microtask) {
7934  kmp_team_t *team = thr->th.th_team;
7935  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7936  int ii = team->t.t_level;
7937  teams_serialized = team->t.t_serialized;
7938  int level = tlevel + 1;
7939  KMP_DEBUG_ASSERT(ii >= tlevel);
7940  while (ii > level) {
7941  for (teams_serialized = team->t.t_serialized;
7942  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7943  }
7944  if (team->t.t_serialized && (!teams_serialized)) {
7945  team = team->t.t_parent;
7946  continue;
7947  }
7948  if (ii > level) {
7949  team = team->t.t_parent;
7950  ii--;
7951  }
7952  }
7953  return team;
7954  }
7955  return NULL;
7956 }
7957 
7958 int __kmp_aux_get_team_num() {
7959  int serialized;
7960  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7961  if (team) {
7962  if (serialized > 1) {
7963  return 0; // teams region is serialized ( 1 team of 1 thread ).
7964  } else {
7965  return team->t.t_master_tid;
7966  }
7967  }
7968  return 0;
7969 }
7970 
7971 int __kmp_aux_get_num_teams() {
7972  int serialized;
7973  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7974  if (team) {
7975  if (serialized > 1) {
7976  return 1;
7977  } else {
7978  return team->t.t_parent->t.t_nproc;
7979  }
7980  }
7981  return 1;
7982 }
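
// Usage sketch (illustrative, not part of the original source) of the standard
// API these helpers back, omp_get_team_num() and omp_get_num_teams():
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main() {
//     #pragma omp teams num_teams(4)
//     printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
//     return 0;
//   }
//
// Outside a teams construct both helpers take the early-return path above and
// report team 0 of 1.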
7983 
7984 /* ------------------------------------------------------------------------ */
7985 
7986 /*
7987  * Affinity Format Parser
7988  *
7989  * Field is in form of: %[[[0].]size]type
7990  * % and type are required (%% means print a literal '%')
7991  * type is either single char or long name surrounded by {},
7992  * e.g., N or {num_threads}
7993  * 0 => leading zeros
7994  * . => right justified when size is specified
7995  * by default output is left justified
7996  * size is the *minimum* field length
7997  * All other characters are printed as is
7998  *
7999  * Available field types:
8000  * t {team_num}         - omp_get_team_num()
8001  * T {num_teams}        - omp_get_num_teams()
8002  * L {nesting_level}    - omp_get_level()
8003  * n {thread_num}       - omp_get_thread_num()
8004  * N {num_threads}      - omp_get_num_threads()
8005  * a {ancestor_tnum}    - omp_get_ancestor_thread_num(omp_get_level()-1)
8006  * H {host}             - name of the host machine
8007  * P {process_id}       - process id (integer)
8008  * i {native_thread_id} - native thread identifier (integer)
8009  * A {thread_affinity}  - comma-separated list of integers or integer
8010  *                        ranges (values of the affinity mask)
8011  * Implementation-specific types can be added; unknown types print "undefined"
8012 */
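
// Illustration (added for clarity; host name and thread numbers are
// hypothetical): given the grammar above, the format string
//
//   "OMP: host %H, tid %0.4n of %N"
//
// expands on thread 3 of an 8-thread team running on a host named "node1" to
//
//   "OMP: host node1, tid 0003 of 8"
//
// i.e. %H prints the host name, %0.4n prints the thread number zero-padded and
// right-justified in a minimum width of 4, and %N prints the team size.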
8013 
8014 // Structure holding the short name, long name, and corresponding data type
8015 // for snprintf. A table of these represents the full set of valid keyword
8016 // field types.
8017 typedef struct kmp_affinity_format_field_t {
8018  char short_name; // short form from the spec, e.g., 'n' for thread_num
8019  const char *long_name; // long form from the spec, e.g., "thread_num"
8020  char field_format; // data type for snprintf (typically 'd' or 's'
8021  // for integer or string)
8022 } kmp_affinity_format_field_t;
8023 
8024 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8025 #if KMP_AFFINITY_SUPPORTED
8026  {'A', "thread_affinity", 's'},
8027 #endif
8028  {'t', "team_num", 'd'},
8029  {'T', "num_teams", 'd'},
8030  {'L', "nesting_level", 'd'},
8031  {'n', "thread_num", 'd'},
8032  {'N', "num_threads", 'd'},
8033  {'a', "ancestor_tnum", 'd'},
8034  {'H', "host", 's'},
8035  {'P', "process_id", 'd'},
8036  {'i', "native_thread_id", 'd'}};
8037 
8038 // Return the number of characters it takes to hold the field
8039 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8040  const char **ptr,
8041  kmp_str_buf_t *field_buffer) {
8042  int rc, format_index, field_value;
8043  const char *width_left, *width_right;
8044  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8045  static const int FORMAT_SIZE = 20;
8046  char format[FORMAT_SIZE] = {0};
8047  char absolute_short_name = 0;
8048 
8049  KMP_DEBUG_ASSERT(gtid >= 0);
8050  KMP_DEBUG_ASSERT(th);
8051  KMP_DEBUG_ASSERT(**ptr == '%');
8052  KMP_DEBUG_ASSERT(field_buffer);
8053 
8054  __kmp_str_buf_clear(field_buffer);
8055 
8056  // Skip the initial %
8057  (*ptr)++;
8058 
8059  // Check for %% first
8060  if (**ptr == '%') {
8061  __kmp_str_buf_cat(field_buffer, "%", 1);
8062  (*ptr)++; // skip over the second %
8063  return 1;
8064  }
8065 
8066  // Parse field modifiers if they are present
8067  pad_zeros = false;
8068  if (**ptr == '0') {
8069  pad_zeros = true;
8070  (*ptr)++; // skip over 0
8071  }
8072  right_justify = false;
8073  if (**ptr == '.') {
8074  right_justify = true;
8075  (*ptr)++; // skip over .
8076  }
8077  // Parse width of field: [width_left, width_right)
8078  width_left = width_right = NULL;
8079  if (**ptr >= '0' && **ptr <= '9') {
8080  width_left = *ptr;
8081  SKIP_DIGITS(*ptr);
8082  width_right = *ptr;
8083  }
8084 
8085  // Create the format for KMP_SNPRINTF based on flags parsed above
8086  format_index = 0;
8087  format[format_index++] = '%';
8088  if (!right_justify)
8089  format[format_index++] = '-';
8090  if (pad_zeros)
8091  format[format_index++] = '0';
8092  if (width_left && width_right) {
8093  int i = 0;
8094  // Only allow widths of up to 8 digits.
8095  // This also prevents overflowing the format variable.
8096  while (i < 8 && width_left < width_right) {
8097  format[format_index++] = *width_left;
8098  width_left++;
8099  i++;
8100  }
8101  }
8102 
8103  // Parse a name (long or short)
8104  // Canonicalize the name into absolute_short_name
8105  found_valid_name = false;
8106  parse_long_name = (**ptr == '{');
8107  if (parse_long_name)
8108  (*ptr)++; // skip initial left brace
8109  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8110  sizeof(__kmp_affinity_format_table[0]);
8111  ++i) {
8112  char short_name = __kmp_affinity_format_table[i].short_name;
8113  const char *long_name = __kmp_affinity_format_table[i].long_name;
8114  char field_format = __kmp_affinity_format_table[i].field_format;
8115  if (parse_long_name) {
8116  size_t length = KMP_STRLEN(long_name);
8117  if (strncmp(*ptr, long_name, length) == 0) {
8118  found_valid_name = true;
8119  (*ptr) += length; // skip the long name
8120  }
8121  } else if (**ptr == short_name) {
8122  found_valid_name = true;
8123  (*ptr)++; // skip the short name
8124  }
8125  if (found_valid_name) {
8126  format[format_index++] = field_format;
8127  format[format_index++] = '\0';
8128  absolute_short_name = short_name;
8129  break;
8130  }
8131  }
8132  if (parse_long_name) {
8133  if (**ptr != '}') {
8134  absolute_short_name = 0;
8135  } else {
8136  (*ptr)++; // skip over the right brace
8137  }
8138  }
8139 
8140  // Attempt to fill the buffer with the requested
8141  // value using snprintf within __kmp_str_buf_print()
8142  switch (absolute_short_name) {
8143  case 't':
8144  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8145  break;
8146  case 'T':
8147  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8148  break;
8149  case 'L':
8150  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8151  break;
8152  case 'n':
8153  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8154  break;
8155  case 'H': {
8156  static const int BUFFER_SIZE = 256;
8157  char buf[BUFFER_SIZE];
8158  __kmp_expand_host_name(buf, BUFFER_SIZE);
8159  rc = __kmp_str_buf_print(field_buffer, format, buf);
8160  } break;
8161  case 'P':
8162  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8163  break;
8164  case 'i':
8165  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8166  break;
8167  case 'N':
8168  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8169  break;
8170  case 'a':
8171  field_value =
8172  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8173  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8174  break;
8175 #if KMP_AFFINITY_SUPPORTED
8176  case 'A': {
8177  kmp_str_buf_t buf;
8178  __kmp_str_buf_init(&buf);
8179  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8180  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8181  __kmp_str_buf_free(&buf);
8182  } break;
8183 #endif
8184  default:
8185  // According to the spec, if an implementation does not have info for the
8186  // field type, then "undefined" is printed
8187  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8188  // Skip the field
8189  if (parse_long_name) {
8190  SKIP_TOKEN(*ptr);
8191  if (**ptr == '}')
8192  (*ptr)++;
8193  } else {
8194  (*ptr)++;
8195  }
8196  }
8197 
8198  KMP_ASSERT(format_index <= FORMAT_SIZE);
8199  return rc;
8200 }
8201 
8202 /*
8203  * Return the number of characters needed to hold the affinity string
8204  * (not including the terminating null byte).
8205  * The resulting string is written to buffer, which the caller can then
8206  * handle afterwards.
8207 */
8208 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8209  kmp_str_buf_t *buffer) {
8210  const char *parse_ptr;
8211  size_t retval;
8212  const kmp_info_t *th;
8213  kmp_str_buf_t field;
8214 
8215  KMP_DEBUG_ASSERT(buffer);
8216  KMP_DEBUG_ASSERT(gtid >= 0);
8217 
8218  __kmp_str_buf_init(&field);
8219  __kmp_str_buf_clear(buffer);
8220 
8221  th = __kmp_threads[gtid];
8222  retval = 0;
8223 
8224  // If format is NULL or a zero-length string, then we use the
8225  // affinity-format-var ICV
8226  parse_ptr = format;
8227  if (parse_ptr == NULL || *parse_ptr == '\0') {
8228  parse_ptr = __kmp_affinity_format;
8229  }
8230  KMP_DEBUG_ASSERT(parse_ptr);
8231 
8232  while (*parse_ptr != '\0') {
8233  // Parse a field
8234  if (*parse_ptr == '%') {
8235  // Put field in the buffer
8236  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8237  __kmp_str_buf_catbuf(buffer, &field);
8238  retval += rc;
8239  } else {
8240  // Put literal character in buffer
8241  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8242  retval++;
8243  parse_ptr++;
8244  }
8245  }
8246  __kmp_str_buf_free(&field);
8247  return retval;
8248 }
8249 
8250 // Displays the affinity string to stdout
8251 void __kmp_aux_display_affinity(int gtid, const char *format) {
8252  kmp_str_buf_t buf;
8253  __kmp_str_buf_init(&buf);
8254  __kmp_aux_capture_affinity(gtid, format, &buf);
8255  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8256  __kmp_str_buf_free(&buf);
8257 }
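
// Usage sketch (illustrative, not part of the original source) of the OpenMP
// 5.0 entry points that the two routines above implement:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main() {
//     #pragma omp parallel
//     {
//       char buf[256];
//       omp_display_affinity("host %H tid %n binds to %A"); // one line per thread
//       size_t n = omp_capture_affinity(buf, sizeof(buf), NULL); // NULL => affinity-format-var
//       if (n < sizeof(buf))
//         printf("%s\n", buf);
//     }
//     return 0;
//   }
//
// Setting OMP_DISPLAY_AFFINITY=TRUE (optionally with OMP_AFFINITY_FORMAT)
// produces similar output automatically, without any source changes.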
8258 
8259 /* ------------------------------------------------------------------------ */
8260 
8261 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8262  int blocktime = arg; /* argument is in milliseconds */
8263 #if KMP_USE_MONITOR
8264  int bt_intervals;
8265 #endif
8266  kmp_int8 bt_set;
8267 
8268  __kmp_save_internal_controls(thread);
8269 
8270  /* Normalize and set blocktime for the teams */
8271  if (blocktime < KMP_MIN_BLOCKTIME)
8272  blocktime = KMP_MIN_BLOCKTIME;
8273  else if (blocktime > KMP_MAX_BLOCKTIME)
8274  blocktime = KMP_MAX_BLOCKTIME;
8275 
8276  set__blocktime_team(thread->th.th_team, tid, blocktime);
8277  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8278 
8279 #if KMP_USE_MONITOR
8280  /* Calculate and set blocktime intervals for the teams */
8281  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8282 
8283  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8284  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8285 #endif
8286 
8287  /* Record that blocktime was explicitly set (bt_set = TRUE) */
8288  bt_set = TRUE;
8289 
8290  set__bt_set_team(thread->th.th_team, tid, bt_set);
8291  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8292 #if KMP_USE_MONITOR
8293  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8294  "bt_intervals=%d, monitor_updates=%d\n",
8295  __kmp_gtid_from_tid(tid, thread->th.th_team),
8296  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8297  __kmp_monitor_wakeups));
8298 #else
8299  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8300  __kmp_gtid_from_tid(tid, thread->th.th_team),
8301  thread->th.th_team->t.t_id, tid, blocktime));
8302 #endif
8303 }
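
// Usage sketch (illustrative, not part of the original source): this routine
// backs the KMP_BLOCKTIME environment variable and the kmp_set_blocktime()
// extension declared in omp.h; the requested value is clamped to
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] as shown above.
//
//   #include <omp.h>
//   int main() {
//     kmp_set_blocktime(0); // idle workers go to sleep right after each region
//     #pragma omp parallel
//     { /* ... */ }
//     return 0;
//   }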
8304 
8305 void __kmp_aux_set_defaults(char const *str, size_t len) {
8306  if (!__kmp_init_serial) {
8307  __kmp_serial_initialize();
8308  }
8309  __kmp_env_initialize(str);
8310 
8311  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8312  __kmp_env_print();
8313  }
8314 } // __kmp_aux_set_defaults
8315 
8316 /* ------------------------------------------------------------------------ */
8317 /* internal fast reduction routines */
8318 
8319 PACKED_REDUCTION_METHOD_T
8320 __kmp_determine_reduction_method(
8321  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8322  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8323  kmp_critical_name *lck) {
8324 
8325  // Default reduction method: critical construct ( lck != NULL, like in current
8326  // PAROPT )
8327  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8328  // can be selected by RTL
8329  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8330  // can be selected by RTL
8331  // Finally, it's up to the OpenMP RTL to decide which method to select among
8332  // those generated by PAROPT.
8333 
8334  PACKED_REDUCTION_METHOD_T retval;
8335 
8336  int team_size;
8337 
8338  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8339  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8340 
8341 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8342  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8343 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8344 
8345  retval = critical_reduce_block;
8346 
8347  // another way of getting the team size (with 1 dynamic dereference) is slower
8348  team_size = __kmp_get_team_num_threads(global_tid);
8349  if (team_size == 1) {
8350 
8351  retval = empty_reduce_block;
8352 
8353  } else {
8354 
8355  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8356 
8357 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8358  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8359 
8360 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8361  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8362 
8363  int teamsize_cutoff = 4;
8364 
8365 #if KMP_MIC_SUPPORTED
8366  if (__kmp_mic_type != non_mic) {
8367  teamsize_cutoff = 8;
8368  }
8369 #endif
8370  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8371  if (tree_available) {
8372  if (team_size <= teamsize_cutoff) {
8373  if (atomic_available) {
8374  retval = atomic_reduce_block;
8375  }
8376  } else {
8377  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8378  }
8379  } else if (atomic_available) {
8380  retval = atomic_reduce_block;
8381  }
8382 #else
8383 #error "Unknown or unsupported OS"
8384 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8385  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8386 
8387 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8388 
8389 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8390 
8391  // basic tuning
8392 
8393  if (atomic_available) {
8394  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8395  retval = atomic_reduce_block;
8396  }
8397  } // otherwise: use critical section
8398 
8399 #elif KMP_OS_DARWIN
8400 
8401  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8402  if (atomic_available && (num_vars <= 3)) {
8403  retval = atomic_reduce_block;
8404  } else if (tree_available) {
8405  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8406  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8407  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8408  }
8409  } // otherwise: use critical section
8410 
8411 #else
8412 #error "Unknown or unsupported OS"
8413 #endif
8414 
8415 #else
8416 #error "Unknown or unsupported architecture"
8417 #endif
8418  }
8419 
8420  // KMP_FORCE_REDUCTION
8421 
8422  // If the team is serialized (team_size == 1), ignore the forced reduction
8423  // method and stay with the unsynchronized method (empty_reduce_block)
8424  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8425  team_size != 1) {
8426 
8427  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8428 
8429  int atomic_available, tree_available;
8430 
8431  switch ((forced_retval = __kmp_force_reduction_method)) {
8432  case critical_reduce_block:
8433  KMP_ASSERT(lck); // lck should be != 0
8434  break;
8435 
8436  case atomic_reduce_block:
8437  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8438  if (!atomic_available) {
8439  KMP_WARNING(RedMethodNotSupported, "atomic");
8440  forced_retval = critical_reduce_block;
8441  }
8442  break;
8443 
8444  case tree_reduce_block:
8445  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8446  if (!tree_available) {
8447  KMP_WARNING(RedMethodNotSupported, "tree");
8448  forced_retval = critical_reduce_block;
8449  } else {
8450 #if KMP_FAST_REDUCTION_BARRIER
8451  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8452 #endif
8453  }
8454  break;
8455 
8456  default:
8457  KMP_ASSERT(0); // "unsupported method specified"
8458  }
8459 
8460  retval = forced_retval;
8461  }
8462 
8463  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8464 
8465 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8466 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8467 
8468  return (retval);
8469 }
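
// Illustrative walk-through (hypothetical configuration, not from the original
// source): on an x86_64 Linux host where the compiler emitted both tree and
// atomic reduction code, a team of 16 threads exceeds teamsize_cutoff (4, or 8
// on MIC), so TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER is selected; the same
// reduction in a team of 4 or fewer threads picks atomic_reduce_block, and a
// serialized team of 1 always gets empty_reduce_block.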
8470 // This function is for testing the set/get/determine reduce method machinery
8471 kmp_int32 __kmp_get_reduce_method(void) {
8472  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8473 }
8474 
8475 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8476 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8477 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8478 
8479 // Hard pause shuts down the runtime completely. Resume happens naturally when
8480 // OpenMP is used subsequently.
8481 void __kmp_hard_pause() {
8482  __kmp_pause_status = kmp_hard_paused;
8483  __kmp_internal_end_thread(-1);
8484 }
8485 
8486 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8487 void __kmp_resume_if_soft_paused() {
8488  if (__kmp_pause_status == kmp_soft_paused) {
8489  __kmp_pause_status = kmp_not_paused;
8490 
8491  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8492  kmp_info_t *thread = __kmp_threads[gtid];
8493  if (thread) { // Wake it if sleeping
8494  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8495  thread);
8496  if (fl.is_sleeping())
8497  fl.resume(gtid);
8498  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8499  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8500  } else { // thread holds the lock and may sleep soon
8501  do { // until either the thread sleeps, or we can get the lock
8502  if (fl.is_sleeping()) {
8503  fl.resume(gtid);
8504  break;
8505  } else if (__kmp_try_suspend_mx(thread)) {
8506  __kmp_unlock_suspend_mx(thread);
8507  break;
8508  }
8509  } while (1);
8510  }
8511  }
8512  }
8513  }
8514 }
8515 
8516 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8517 // TODO: add warning messages
8518 int __kmp_pause_resource(kmp_pause_status_t level) {
8519  if (level == kmp_not_paused) { // requesting resume
8520  if (__kmp_pause_status == kmp_not_paused) {
8521  // error message about runtime not being paused, so can't resume
8522  return 1;
8523  } else {
8524  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8525  __kmp_pause_status == kmp_hard_paused);
8526  __kmp_pause_status = kmp_not_paused;
8527  return 0;
8528  }
8529  } else if (level == kmp_soft_paused) { // requesting soft pause
8530  if (__kmp_pause_status != kmp_not_paused) {
8531  // error message about already being paused
8532  return 1;
8533  } else {
8534  __kmp_soft_pause();
8535  return 0;
8536  }
8537  } else if (level == kmp_hard_paused) { // requesting hard pause
8538  if (__kmp_pause_status != kmp_not_paused) {
8539  // error message about already being paused
8540  return 1;
8541  } else {
8542  __kmp_hard_pause();
8543  return 0;
8544  }
8545  } else {
8546  // error message about invalid level
8547  return 1;
8548  }
8549 }
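
// Usage sketch (illustrative, not part of the original source) of the OpenMP
// 5.0 pause API that reaches this routine via __kmpc_pause_resource:
//
//   #include <omp.h>
//   int main() {
//     #pragma omp parallel
//     { /* warm up the runtime */ }
//     if (omp_pause_resource_all(omp_pause_soft) != 0) {
//       /* request rejected, e.g. the runtime was already paused */
//     }
//     #pragma omp parallel // touching OpenMP again resumes the runtime
//     { /* ... */ }
//     return 0;
//   }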
8550 
8551 void __kmp_omp_display_env(int verbose) {
8552  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8553  if (__kmp_init_serial == 0)
8554  __kmp_do_serial_initialize();
8555  __kmp_display_env_impl(!verbose, verbose);
8556  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8557 }
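
// Note (illustrative, not from the original source): the report produced here
// is the same one the standard OMP_DISPLAY_ENV environment variable triggers
// at startup; verbose != 0 corresponds to OMP_DISPLAY_ENV=VERBOSE (ICVs plus
// the runtime-specific KMP_* settings), verbose == 0 to OMP_DISPLAY_ENV=TRUE.
// For example:
//
//   OMP_DISPLAY_ENV=VERBOSE ./a.out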
8558 
8559 // Globals and functions for hidden helper tasks
8560 kmp_info_t **__kmp_hidden_helper_threads;
8561 kmp_info_t *__kmp_hidden_helper_main_thread;
8562 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8563 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8564 #if KMP_OS_LINUX
8565 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8566 #else
8567 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8568 #endif
8569 
8570 namespace {
8571 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8572 
8573 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8574  // This is an explicit synchronization of all hidden helper threads. It covers
8575  // the case where a regular thread pushes a hidden helper task to a hidden
8576  // helper thread that has not yet been woken since the main thread released
8577  // the team after creating it.
8578  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8579  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8580  __kmp_hidden_helper_threads_num)
8581  ;
8582 
8583  // If master thread of the hidden helper team, then wait for the signal
8584  if (__kmpc_master(nullptr, *gtid)) {
8585  // First, unset the initial state and release the initial thread
8586  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8587  __kmp_hidden_helper_initz_release();
8588  __kmp_hidden_helper_main_thread_wait();
8589  // Now wake up all worker threads
8590  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8591  __kmp_hidden_helper_worker_thread_signal();
8592  }
8593  }
8594 }
8595 } // namespace
8596 
8597 void __kmp_hidden_helper_threads_initz_routine() {
8598  // Create a new root for hidden helper team/threads
8599  const int gtid = __kmp_register_root(TRUE);
8600  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8601  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8602  __kmp_hidden_helper_main_thread->th.th_set_nproc =
8603  __kmp_hidden_helper_threads_num;
8604 
8605  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8606 
8607  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8608 
8609  // Set the initialization flag to FALSE
8610  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8611 
8612  __kmp_hidden_helper_threads_deinitz_release();
8613 }
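
// Note (illustrative interpretation, not from the original comments): the team
// created here consists of __kmp_hidden_helper_threads_num threads (8 by
// default, enabled only on Linux above) that stay outside the user-visible
// teams and service hidden helper tasks, such as those generated for
// asynchronous offload of the form
//
//   #pragma omp target nowait
//   { /* ... */ }
//
// so that this work does not occupy one of the application's worker threads.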