1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 #include "kmp_dispatch.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 
36 /* these are temporary issues to be dealt with */
37 #define KMP_USE_PRCTL 0
38 
39 #if KMP_OS_WINDOWS
40 #include <process.h>
41 #endif
42 
43 #include "tsan_annotations.h"
44 
45 #if defined(KMP_GOMP_COMPAT)
46 char const __kmp_version_alt_comp[] =
47  KMP_VERSION_PREFIX "alternative compiler support: yes";
48 #endif /* defined(KMP_GOMP_COMPAT) */
49 
50 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
51 #if OMP_50_ENABLED
52  "5.0 (201611)";
53 #elif OMP_45_ENABLED
54  "4.5 (201511)";
55 #elif OMP_40_ENABLED
56  "4.0 (201307)";
57 #else
58  "3.1 (201107)";
59 #endif
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63  KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79  int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81  kmp_internal_control_t *new_icvs,
82  ident_t *loc);
83 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85  int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91  kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* fast (and somewhat portable) way to get a unique identifier of the executing
107  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109  int i;
110  kmp_info_t **other_threads;
111  size_t stack_data;
112  char *stack_addr;
113  size_t stack_size;
114  char *stack_base;
115 
116  KA_TRACE(
117  1000,
118  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
119  __kmp_nth, __kmp_all_nth));
120 
121  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
122  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
123  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
124  __kmp_init_gtid for this to work. */
125 
126  if (!TCR_4(__kmp_init_gtid))
127  return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130  if (TCR_4(__kmp_gtid_mode) >= 3) {
131  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132  return __kmp_gtid;
133  }
134 #endif
135  if (TCR_4(__kmp_gtid_mode) >= 2) {
136  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137  return __kmp_gtid_get_specific();
138  }
139  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141  stack_addr = (char *)&stack_data;
142  other_threads = __kmp_threads;
143 
144  /* ATT: The code below is a source of potential bugs due to unsynchronized
145  access to __kmp_threads array. For example:
146  1. Current thread loads other_threads[i] to thr and checks it, it is
147  non-NULL.
148  2. Current thread is suspended by OS.
149  3. Another thread unregisters and finishes (debug versions of free()
150  may fill memory with something like 0xEF).
151  4. Current thread is resumed.
152  5. Current thread reads junk from *thr.
153  TODO: Fix it. --ln */
154 
155  for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158  if (!thr)
159  continue;
160 
161  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164  /* stack grows down -- search through all of the active threads */
165 
166  if (stack_addr <= stack_base) {
167  size_t stack_diff = stack_base - stack_addr;
168 
169  if (stack_diff <= stack_size) {
170  /* The only way we can be closer than the allocated */
171  /* stack size is if we are running on this thread. */
172  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173  return i;
174  }
175  }
176  }
177 
178  /* get specific to try and determine our gtid */
179  KA_TRACE(1000,
180  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181  "thread, using TLS\n"));
182  i = __kmp_gtid_get_specific();
183 
184  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
185 
186  /* if we haven't been assigned a gtid, then return the code */
187  if (i < 0)
188  return i;
189 
190  /* dynamically updated stack window for uber threads to avoid get_specific
191  call */
192  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193  KMP_FATAL(StackOverflow, i);
194  }
195 
196  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197  if (stack_addr > stack_base) {
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201  stack_base);
202  } else {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  stack_base - stack_addr);
205  }
206 
207  /* Reprint stack bounds for ubermaster since they have been refined */
208  if (__kmp_storage_map) {
209  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212  other_threads[i]->th.th_info.ds.ds_stacksize,
213  "th_%d stack (refinement)", i);
214  }
215  return i;
216 }
217 
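/* Like __kmp_get_global_thread_id(), but if no gtid has been assigned yet
   (e.g. we are a new root/foreign thread), take the bootstrap initialization
   lock, run serial initialization if needed or register a new root, and
   return the resulting gtid. */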
218 int __kmp_get_global_thread_id_reg() {
219  int gtid;
220 
221  if (!__kmp_init_serial) {
222  gtid = KMP_GTID_DNE;
223  } else
224 #ifdef KMP_TDATA_GTID
225  if (TCR_4(__kmp_gtid_mode) >= 3) {
226  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227  gtid = __kmp_gtid;
228  } else
229 #endif
230  if (TCR_4(__kmp_gtid_mode) >= 2) {
231  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232  gtid = __kmp_gtid_get_specific();
233  } else {
234  KA_TRACE(1000,
235  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236  gtid = __kmp_get_global_thread_id();
237  }
238 
239  /* we must be a new uber master sibling thread */
240  if (gtid == KMP_GTID_DNE) {
241  KA_TRACE(10,
242  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243  "Registering a new gtid.\n"));
244  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245  if (!__kmp_init_serial) {
246  __kmp_do_serial_initialize();
247  gtid = __kmp_gtid_get_specific();
248  } else {
249  gtid = __kmp_register_root(FALSE);
250  }
251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253  }
254 
255  KMP_DEBUG_ASSERT(gtid >= 0);
256 
257  return gtid;
258 }
259 
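/* Check whether the stack of thread "th" overlaps the stack of any other
   registered thread; when __kmp_env_checks is enabled and an overlap is found,
   print the offending ranges and abort with a fatal StackOverlap error. */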
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262  int f;
263  char *stack_beg = NULL;
264  char *stack_end = NULL;
265  int gtid;
266 
267  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268  if (__kmp_storage_map) {
269  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272  gtid = __kmp_gtid_from_thread(th);
273 
274  if (gtid == KMP_GTID_MONITOR) {
275  __kmp_print_storage_map_gtid(
276  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277  "th_%s stack (%s)", "mon",
278  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279  } else {
280  __kmp_print_storage_map_gtid(
281  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282  "th_%d stack (%s)", gtid,
283  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284  }
285  }
286 
287  /* No point in checking ubermaster threads since they use refinement and
288  * cannot overlap */
289  gtid = __kmp_gtid_from_thread(th);
290  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291  KA_TRACE(10,
292  ("__kmp_check_stack_overlap: performing extensive checking\n"));
293  if (stack_beg == NULL) {
294  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296  }
297 
298  for (f = 0; f < __kmp_threads_capacity; f++) {
299  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301  if (f_th && f_th != th) {
302  char *other_stack_end =
303  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304  char *other_stack_beg =
305  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309  /* Print the other stack values before the abort */
310  if (__kmp_storage_map)
311  __kmp_print_storage_map_gtid(
312  -1, other_stack_beg, other_stack_end,
313  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317  __kmp_msg_null);
318  }
319  }
320  }
321  }
322  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
324 
325 /* ------------------------------------------------------------------------ */
326 
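/* Park the calling thread in a yielding loop that never terminates; used by
   the abort paths below so the process can be torn down without racing the
   caller. */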
327 void __kmp_infinite_loop(void) {
328  static int done = FALSE;
329 
330  while (!done) {
331  KMP_YIELD(1);
332  }
333 }
334 
335 #define MAX_MESSAGE 512
336 
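/* Print one storage-map line for the address range p1..p2 of the given size on
   behalf of thread "gtid"; the printf-style "format" arguments describe the
   object being mapped. Output goes to kmp_err under the stdio bootstrap lock. */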
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338  char const *format, ...) {
339  char buffer[MAX_MESSAGE];
340  va_list ap;
341 
342  va_start(ap, format);
343  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344  p2, (unsigned long)size, format);
345  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346  __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348  int node;
349  if (gtid >= 0) {
350  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351  if (__kmp_storage_map_verbose) {
352  node = __kmp_get_host_node(p1);
353  if (node < 0) /* doesn't work, so don't try this next time */
354  __kmp_storage_map_verbose = FALSE;
355  else {
356  char *last;
357  int lastNode;
358  int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360  const int page_size = KMP_GET_PAGE_SIZE();
361 
362  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364  if (localProc >= 0)
365  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
366  localProc >> 1);
367  else
368  __kmp_printf_no_lock(" GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370  /* The more elaborate format is disabled for now because of the prctl
371  * hanging bug. */
372  do {
373  last = p1;
374  lastNode = node;
375  /* This loop collates adjacent pages with the same host node. */
376  do {
377  p1 = (char *)p1 + page_size;
378  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
380  lastNode);
381  } while (p1 <= p2);
382 #else
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
384  (char *)p1 + (page_size - 1),
385  __kmp_get_host_node(p1));
386  if (p1 < p2) {
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
388  (char *)p2 + (page_size - 1),
389  __kmp_get_host_node(p2));
390  }
391 #endif
392  }
393  }
394  } else
395  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
396  }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
400 
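/* Print a printf-style runtime warning to kmp_err, unless warning generation
   has been turned off (__kmp_generate_warnings == kmp_warnings_off). */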
401 void __kmp_warn(char const *format, ...) {
402  char buffer[MAX_MESSAGE];
403  va_list ap;
404 
405  if (__kmp_generate_warnings == kmp_warnings_off) {
406  return;
407  }
408 
409  va_start(ap, format);
410 
411  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413  __kmp_vprintf(kmp_err, buffer, ap);
414  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416  va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420  // Later threads may stall here, but that's ok because abort() will kill them.
421  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423  if (__kmp_debug_buf) {
424  __kmp_dump_debug_buffer();
425  }
426 
427  if (KMP_OS_WINDOWS) {
428  // Let other threads know of abnormal termination and prevent deadlock
429  // if abort happened during library initialization or shutdown
430  __kmp_global.g.g_abort = SIGABRT;
431 
432  /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
433  nightly testing. Unfortunately, we cannot reliably suppress the pop-up error
434  boxes. _set_abort_behavior() works well, but this function is not
435  available in VS7 (this is not a problem for the DLL, but it is a problem
436  for the static OpenMP RTL). SetErrorMode (and thus the timelimit utility)
437  does not help, at least in some versions of the MS C RTL.
438 
439  It seems the following sequence is the only way to simulate abort() and
440  avoid the pop-up error box. */
441  raise(SIGABRT);
442  _exit(3); // Just in case the signal is ignored, exit anyway.
443  } else {
444  abort();
445  }
446 
447  __kmp_infinite_loop();
448  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453  // TODO: Eliminate g_abort global variable and this function.
454  // In case of abort just call abort(), it will kill all the threads.
455  __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459  that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463  gtid);
464 
465  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469  sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471  __kmp_print_storage_map_gtid(
472  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476  &thr->th.th_bar[bs_plain_barrier + 1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478  gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481  &thr->th.th_bar[bs_forkjoin_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483  gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487  &thr->th.th_bar[bs_reduction_barrier + 1],
488  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489  gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494  that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497  int team_id, int num_thr) {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500  header, team_id);
501 
502  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503  &team->t.t_bar[bs_last_barrier],
504  sizeof(kmp_balign_team_t) * bs_last_barrier,
505  "%s_%d.t_bar", header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508  &team->t.t_bar[bs_plain_barrier + 1],
509  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513  &team->t.t_bar[bs_forkjoin_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519  &team->t.t_bar[bs_reduction_barrier + 1],
520  sizeof(kmp_balign_team_t),
521  "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524  __kmp_print_storage_map_gtid(
525  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528  __kmp_print_storage_map_gtid(
529  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533  &team->t.t_disp_buffer[num_disp_buff],
534  sizeof(dispatch_shared_info_t) * num_disp_buff,
535  "%s_%d.t_disp_buffer", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
538  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
539  team_id);
540 }
541 
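/* Set up / tear down support for the OpenMP 5.0 memory allocators; when
   OMP_50_ENABLED, this initializes and finalizes the optional memkind-based
   allocator backend. */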
542 static void __kmp_init_allocator() {
543 #if OMP_50_ENABLED
544  __kmp_init_memkind();
545 #endif
546 }
547 static void __kmp_fini_allocator() {
548 #if OMP_50_ENABLED
549  __kmp_fini_memkind();
550 #endif
551 }
552 
553 /* ------------------------------------------------------------------------ */
554 
555 #ifdef KMP_DYNAMIC_LIB
556 #if KMP_OS_WINDOWS
557 
558 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
559  // TODO: Change to __kmp_break_bootstrap_lock().
560  __kmp_init_bootstrap_lock(lck); // reinitialize the lock to the released state
561 }
562 
563 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
564  int i;
565  int thread_count;
566 
567  // PROCESS_DETACH is expected to be called by a thread that executes
568  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
569  // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
570  // __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
571  // threads can still be alive here, although they are about to be terminated.
572  // The threads in the array with ds_thread==0 are the most suspicious, so it
573  // may actually be unsafe to access __kmp_threads[].
574 
575  // TODO: does it make sense to check __kmp_roots[] ?
576 
577  // Let's check that there are no other alive threads registered with the OMP
578  // lib.
579  while (1) {
580  thread_count = 0;
581  for (i = 0; i < __kmp_threads_capacity; ++i) {
582  if (!__kmp_threads)
583  continue;
584  kmp_info_t *th = __kmp_threads[i];
585  if (th == NULL)
586  continue;
587  int gtid = th->th.th_info.ds.ds_gtid;
588  if (gtid == gtid_req)
589  continue;
590  if (gtid < 0)
591  continue;
592  DWORD exit_val;
593  int alive = __kmp_is_thread_alive(th, &exit_val);
594  if (alive) {
595  ++thread_count;
596  }
597  }
598  if (thread_count == 0)
599  break; // success
600  }
601 
602  // Assume that I'm alone. Now it might be safe to check and reset locks.
603  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
604  __kmp_reset_lock(&__kmp_forkjoin_lock);
605 #ifdef KMP_DEBUG
606  __kmp_reset_lock(&__kmp_stdio_lock);
607 #endif // KMP_DEBUG
608 }
609 
610 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
611  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
612 
613  switch (fdwReason) {
614 
615  case DLL_PROCESS_ATTACH:
616  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
617 
618  return TRUE;
619 
620  case DLL_PROCESS_DETACH:
621  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
622 
623  if (lpReserved != NULL) {
624  // lpReserved is used for telling the difference:
625  // lpReserved == NULL when FreeLibrary() was called,
626  // lpReserved != NULL when the process terminates.
627  // When FreeLibrary() is called, worker threads remain alive. So they will
628  // release the forkjoin lock by themselves. When the process terminates,
629  // worker threads disappear triggering the problem of unreleased forkjoin
630  // lock as described below.
631 
632  // A worker thread can take the forkjoin lock. The problem comes up if
633  // that worker thread becomes dead before it releases the forkjoin lock.
634  // The forkjoin lock remains taken, while the thread executing
635  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
636  // to take the forkjoin lock and will always fail, so that the application
637  // will never finish [normally]. This scenario is possible if
638  // __kmpc_end() has not been executed. It looks like it's not a corner
639  // case, but common cases:
640  // - the main function was compiled by an alternative compiler;
641  // - the main function was compiled by icl but without /Qopenmp
642  // (application with plugins);
643  // - application terminates by calling C exit(), Fortran CALL EXIT() or
644  // Fortran STOP.
645  // - alive foreign thread prevented __kmpc_end from doing cleanup.
646  //
647  // This is a hack to work around the problem.
648  // TODO: !!! figure out something better.
649  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
650  }
651 
652  __kmp_internal_end_library(__kmp_gtid_get_specific());
653 
654  return TRUE;
655 
656  case DLL_THREAD_ATTACH:
657  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
658 
659  /* if we want to register new siblings all the time here call
660  * __kmp_get_gtid(); */
661  return TRUE;
662 
663  case DLL_THREAD_DETACH:
664  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
665 
666  __kmp_internal_end_thread(__kmp_gtid_get_specific());
667  return TRUE;
668  }
669 
670  return TRUE;
671 }
672 
673 #endif /* KMP_OS_WINDOWS */
674 #endif /* KMP_DYNAMIC_LIB */
675 
676 /* Change the library type to "status" and return the old type */
677 /* called from within initialization routines where __kmp_initz_lock is held */
678 int __kmp_change_library(int status) {
679  int old_status;
680 
681  old_status = __kmp_yield_init &
682  1; // check whether KMP_LIBRARY=throughput (even init count)
683 
684  if (status) {
685  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
686  } else {
687  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
688  }
689 
690  return old_status; // return previous setting of whether
691  // KMP_LIBRARY=throughput
692 }
693 
694 /* __kmp_parallel_deo -- Wait until it's our turn. */
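/* With BUILD_PARALLEL_ORDERED, spin (yielding) until the team's ordered ticket
   t_ordered.dt.t_value equals this thread's tid; the sync construct is pushed
   first when consistency checks are enabled. */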
695 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696  int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698  kmp_team_t *team = __kmp_team_from_gtid(gtid);
699 #endif /* BUILD_PARALLEL_ORDERED */
700 
701  if (__kmp_env_consistency_check) {
702  if (__kmp_threads[gtid]->th.th_root->r.r_active)
703 #if KMP_USE_DYNAMIC_LOCK
704  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
705 #else
706  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
707 #endif
708  }
709 #ifdef BUILD_PARALLEL_ORDERED
710  if (!team->t.t_serialized) {
711  KMP_MB();
712  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
713  KMP_EQ, NULL);
714  KMP_MB();
715  }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* __kmp_parallel_dxo -- Signal the next task. */
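/* With BUILD_PARALLEL_ORDERED, pass the ordered ticket to the next thread in
   the team by setting t_ordered.dt.t_value to (tid + 1) % nproc, with memory
   fences around the update. */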
720 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
721  int gtid = *gtid_ref;
722 #ifdef BUILD_PARALLEL_ORDERED
723  int tid = __kmp_tid_from_gtid(gtid);
724  kmp_team_t *team = __kmp_team_from_gtid(gtid);
725 #endif /* BUILD_PARALLEL_ORDERED */
726 
727  if (__kmp_env_consistency_check) {
728  if (__kmp_threads[gtid]->th.th_root->r.r_active)
729  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
730  }
731 #ifdef BUILD_PARALLEL_ORDERED
732  if (!team->t.t_serialized) {
733  KMP_MB(); /* Flush all pending memory write invalidates. */
734 
735  /* use the tid of the next thread in this team */
736  /* TODO replace with general release procedure */
737  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
738 
739  KMP_MB(); /* Flush all pending memory write invalidates. */
740  }
741 #endif /* BUILD_PARALLEL_ORDERED */
742 }
743 
744 /* ------------------------------------------------------------------------ */
745 /* The BARRIER for a SINGLE process section is always explicit */
746 
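/* Returns nonzero if the calling thread claims the SINGLE block (i.e. wins the
   atomic compare-and-store on team->t.t_construct), zero otherwise; a
   serialized team trivially succeeds. */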
747 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
748  int status;
749  kmp_info_t *th;
750  kmp_team_t *team;
751 
752  if (!TCR_4(__kmp_init_parallel))
753  __kmp_parallel_initialize();
754 
755  th = __kmp_threads[gtid];
756  team = th->th.th_team;
757  status = 0;
758 
759  th->th.th_ident = id_ref;
760 
761  if (team->t.t_serialized) {
762  status = 1;
763  } else {
764  kmp_int32 old_this = th->th.th_local.this_construct;
765 
766  ++th->th.th_local.this_construct;
767  /* try to set team count to thread count--success means thread got the
768  single block */
769  /* TODO: Should this be acquire or release? */
770  if (team->t.t_construct == old_this) {
771  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
772  th->th.th_local.this_construct);
773  }
774 #if USE_ITT_BUILD
775  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
776  KMP_MASTER_GTID(gtid) &&
777 #if OMP_40_ENABLED
778  th->th.th_teams_microtask == NULL &&
779 #endif
780  team->t.t_active_level ==
781  1) { // Only report metadata by master of active team at level 1
782  __kmp_itt_metadata_single(id_ref);
783  }
784 #endif /* USE_ITT_BUILD */
785  }
786 
787  if (__kmp_env_consistency_check) {
788  if (status && push_ws) {
789  __kmp_push_workshare(gtid, ct_psingle, id_ref);
790  } else {
791  __kmp_check_workshare(gtid, ct_psingle, id_ref);
792  }
793  }
794 #if USE_ITT_BUILD
795  if (status) {
796  __kmp_itt_single_start(gtid);
797  }
798 #endif /* USE_ITT_BUILD */
799  return status;
800 }
801 
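/* Mark the end of a SINGLE region entered via __kmp_enter_single: close the
   ITT single frame and, when consistency checks are enabled, pop the workshare
   from the construct stack. */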
802 void __kmp_exit_single(int gtid) {
803 #if USE_ITT_BUILD
804  __kmp_itt_single_end(gtid);
805 #endif /* USE_ITT_BUILD */
806  if (__kmp_env_consistency_check)
807  __kmp_pop_workshare(gtid, ct_psingle, NULL);
808 }
809 
810 /* determine if we can go parallel or must use a serialized parallel region and
811  * how many threads we can use
812  * set_nthreads is the number of threads requested for the team
813  * returns 1 if we should serialize or only use one thread,
814  * otherwise the number of threads to use
815  * The forkjoin lock is held by the caller. */
816 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
817  int master_tid, int set_nthreads
818 #if OMP_40_ENABLED
819  ,
820  int enter_teams
821 #endif /* OMP_40_ENABLED */
822  ) {
823  int capacity;
824  int new_nthreads;
825  KMP_DEBUG_ASSERT(__kmp_init_serial);
826  KMP_DEBUG_ASSERT(root && parent_team);
827 
828  // If dyn-var is set, dynamically adjust the number of desired threads,
829  // according to the method specified by dynamic_mode.
830  new_nthreads = set_nthreads;
831  if (!get__dynamic_2(parent_team, master_tid)) {
832  ;
833  }
834 #ifdef USE_LOAD_BALANCE
835  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
836  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
837  if (new_nthreads == 1) {
838  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
839  "reservation to 1 thread\n",
840  master_tid));
841  return 1;
842  }
843  if (new_nthreads < set_nthreads) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
845  "reservation to %d threads\n",
846  master_tid, new_nthreads));
847  }
848  }
849 #endif /* USE_LOAD_BALANCE */
850  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
851  new_nthreads = __kmp_avail_proc - __kmp_nth +
852  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
853  if (new_nthreads <= 1) {
854  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
855  "reservation to 1 thread\n",
856  master_tid));
857  return 1;
858  }
859  if (new_nthreads < set_nthreads) {
860  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
861  "reservation to %d threads\n",
862  master_tid, new_nthreads));
863  } else {
864  new_nthreads = set_nthreads;
865  }
866  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
867  if (set_nthreads > 2) {
868  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
869  new_nthreads = (new_nthreads % set_nthreads) + 1;
870  if (new_nthreads == 1) {
871  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
872  "reservation to 1 thread\n",
873  master_tid));
874  return 1;
875  }
876  if (new_nthreads < set_nthreads) {
877  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
878  "reservation to %d threads\n",
879  master_tid, new_nthreads));
880  }
881  }
882  } else {
883  KMP_ASSERT(0);
884  }
885 
886  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
887  if (__kmp_nth + new_nthreads -
888  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
889  __kmp_max_nth) {
890  int tl_nthreads = __kmp_max_nth - __kmp_nth +
891  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
892  if (tl_nthreads <= 0) {
893  tl_nthreads = 1;
894  }
895 
896  // If dyn-var is false, emit a 1-time warning.
897  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
898  __kmp_reserve_warn = 1;
899  __kmp_msg(kmp_ms_warning,
900  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
901  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
902  }
903  if (tl_nthreads == 1) {
904  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
905  "reduced reservation to 1 thread\n",
906  master_tid));
907  return 1;
908  }
909  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
910  "reservation to %d threads\n",
911  master_tid, tl_nthreads));
912  new_nthreads = tl_nthreads;
913  }
914 
915  // Respect OMP_THREAD_LIMIT
916  if (root->r.r_cg_nthreads + new_nthreads -
917  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
918  __kmp_cg_max_nth) {
919  int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
920  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
921  if (tl_nthreads <= 0) {
922  tl_nthreads = 1;
923  }
924 
925  // If dyn-var is false, emit a 1-time warning.
926  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
927  __kmp_reserve_warn = 1;
928  __kmp_msg(kmp_ms_warning,
929  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
930  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
931  }
932  if (tl_nthreads == 1) {
933  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
934  "reduced reservation to 1 thread\n",
935  master_tid));
936  return 1;
937  }
938  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
939  "reservation to %d threads\n",
940  master_tid, tl_nthreads));
941  new_nthreads = tl_nthreads;
942  }
943 
944  // Check if the threads array is large enough, or needs expanding.
945  // See comment in __kmp_register_root() about the adjustment if
946  // __kmp_threads[0] == NULL.
947  capacity = __kmp_threads_capacity;
948  if (TCR_PTR(__kmp_threads[0]) == NULL) {
949  --capacity;
950  }
951  if (__kmp_nth + new_nthreads -
952  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
953  capacity) {
954  // Expand the threads array.
955  int slotsRequired = __kmp_nth + new_nthreads -
956  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
957  capacity;
958  int slotsAdded = __kmp_expand_threads(slotsRequired);
959  if (slotsAdded < slotsRequired) {
960  // The threads array was not expanded enough.
961  new_nthreads -= (slotsRequired - slotsAdded);
962  KMP_ASSERT(new_nthreads >= 1);
963 
964  // If dyn-var is false, emit a 1-time warning.
965  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
966  __kmp_reserve_warn = 1;
967  if (__kmp_tp_cached) {
968  __kmp_msg(kmp_ms_warning,
969  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
970  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
971  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
972  } else {
973  __kmp_msg(kmp_ms_warning,
974  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
975  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
976  }
977  }
978  }
979  }
980 
981 #ifdef KMP_DEBUG
982  if (new_nthreads == 1) {
983  KC_TRACE(10,
984  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
985  "dead roots and rechecking; requested %d threads\n",
986  __kmp_get_gtid(), set_nthreads));
987  } else {
988  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
989  " %d threads\n",
990  __kmp_get_gtid(), new_nthreads, set_nthreads));
991  }
992 #endif // KMP_DEBUG
993  return new_nthreads;
994 }
995 
996 /* Allocate threads from the thread pool and assign them to the new team. We are
997  assured that there are enough threads available, because we checked on that
998  earlier within the forkjoin critical section. */
999 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1000  kmp_info_t *master_th, int master_gtid) {
1001  int i;
1002  int use_hot_team;
1003 
1004  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1005  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1006  KMP_MB();
1007 
1008  /* first, let's setup the master thread */
1009  master_th->th.th_info.ds.ds_tid = 0;
1010  master_th->th.th_team = team;
1011  master_th->th.th_team_nproc = team->t.t_nproc;
1012  master_th->th.th_team_master = master_th;
1013  master_th->th.th_team_serialized = FALSE;
1014  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1015 
1016 /* make sure we are not the optimized hot team */
1017 #if KMP_NESTED_HOT_TEAMS
1018  use_hot_team = 0;
1019  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1020  if (hot_teams) { // hot teams array is not allocated if
1021  // KMP_HOT_TEAMS_MAX_LEVEL=0
1022  int level = team->t.t_active_level - 1; // index in array of hot teams
1023  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1024  if (master_th->th.th_teams_size.nteams > 1) {
1025  ++level; // level was not increased in teams construct for
1026  // team_of_masters
1027  }
1028  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1029  master_th->th.th_teams_level == team->t.t_level) {
1030  ++level; // level was not increased in teams construct for
1031  // team_of_workers before the parallel
1032  } // team->t.t_level will be increased inside parallel
1033  }
1034  if (level < __kmp_hot_teams_max_level) {
1035  if (hot_teams[level].hot_team) {
1036  // hot team has already been allocated for given level
1037  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1038  use_hot_team = 1; // the team is ready to use
1039  } else {
1040  use_hot_team = 0; // AC: threads are not allocated yet
1041  hot_teams[level].hot_team = team; // remember new hot team
1042  hot_teams[level].hot_team_nth = team->t.t_nproc;
1043  }
1044  } else {
1045  use_hot_team = 0;
1046  }
1047  }
1048 #else
1049  use_hot_team = team == root->r.r_hot_team;
1050 #endif
1051  if (!use_hot_team) {
1052 
1053  /* install the master thread */
1054  team->t.t_threads[0] = master_th;
1055  __kmp_initialize_info(master_th, team, 0, master_gtid);
1056 
1057  /* now, install the worker threads */
1058  for (i = 1; i < team->t.t_nproc; i++) {
1059 
1060  /* fork or reallocate a new thread and install it in team */
1061  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1062  team->t.t_threads[i] = thr;
1063  KMP_DEBUG_ASSERT(thr);
1064  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1065  /* align team and thread arrived states */
1066  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1067  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1068  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1069  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1070  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1071  team->t.t_bar[bs_plain_barrier].b_arrived));
1072 #if OMP_40_ENABLED
1073  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1074  thr->th.th_teams_level = master_th->th.th_teams_level;
1075  thr->th.th_teams_size = master_th->th.th_teams_size;
1076 #endif
1077  { // Initialize threads' barrier data.
1078  int b;
1079  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1080  for (b = 0; b < bs_last_barrier; ++b) {
1081  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1082  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1083 #if USE_DEBUGGER
1084  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1085 #endif
1086  }
1087  }
1088  }
1089 
1090 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1091  __kmp_partition_places(team);
1092 #endif
1093  }
1094 
1095  KMP_MB();
1096 }
1097 
1098 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1099 // Propagate any changes to the floating point control registers out to the team
1100 // We try to avoid unnecessary writes to the relevant cache line in the team
1101 // structure, so we don't make changes unless they are needed.
1102 inline static void propagateFPControl(kmp_team_t *team) {
1103  if (__kmp_inherit_fp_control) {
1104  kmp_int16 x87_fpu_control_word;
1105  kmp_uint32 mxcsr;
1106 
1107  // Get master values of FPU control flags (both X87 and vector)
1108  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109  __kmp_store_mxcsr(&mxcsr);
1110  mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112  // There is no point looking at t_fp_control_saved here.
1113  // If it is TRUE, we still have to update the values if they are different
1114  // from those we now have. If it is FALSE we didn't save anything yet, but
1115  // our objective is the same. We have to ensure that the values in the team
1116  // are the same as those we have.
1117  // So, this code achieves what we need whether or not t_fp_control_saved is
1118  // true. By checking whether the value needs updating we avoid unnecessary
1119  // writes that would put the cache-line into a written state, causing all
1120  // threads in the team to have to read it again.
1121  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1122  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1123  // Although we don't use this value, other code in the runtime wants to know
1124  // whether it should restore them. So we must ensure it is correct.
1125  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1126  } else {
1127  // Similarly here. Don't write to this cache-line in the team structure
1128  // unless we have to.
1129  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1130  }
1131 }
1132 
1133 // Do the opposite, setting the hardware registers to the updated values from
1134 // the team.
1135 inline static void updateHWFPControl(kmp_team_t *team) {
1136  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1137  // Only reset the fp control regs if they have been changed in the team
1138  // by the parallel region that we are exiting.
1139  kmp_int16 x87_fpu_control_word;
1140  kmp_uint32 mxcsr;
1141  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1142  __kmp_store_mxcsr(&mxcsr);
1143  mxcsr &= KMP_X86_MXCSR_MASK;
1144 
1145  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1146  __kmp_clear_x87_fpu_status_word();
1147  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1148  }
1149 
1150  if (team->t.t_mxcsr != mxcsr) {
1151  __kmp_load_mxcsr(&team->t.t_mxcsr);
1152  }
1153  }
1154 }
1155 #else
1156 #define propagateFPControl(x) ((void)0)
1157 #define updateHWFPControl(x) ((void)0)
1158 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1159 
1160 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1161  int realloc); // forward declaration
1162 
1163 /* Run a parallel region that has been serialized, so it runs only in a team of the
1164  single master thread. */
1165 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1166  kmp_info_t *this_thr;
1167  kmp_team_t *serial_team;
1168 
1169  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1170 
1171  /* Skip all this code for autopar serialized loops since it results in
1172  unacceptable overhead */
1173  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1174  return;
1175 
1176  if (!TCR_4(__kmp_init_parallel))
1177  __kmp_parallel_initialize();
1178 
1179  this_thr = __kmp_threads[global_tid];
1180  serial_team = this_thr->th.th_serial_team;
1181 
1182  /* utilize the serialized team held by this thread */
1183  KMP_DEBUG_ASSERT(serial_team);
1184  KMP_MB();
1185 
1186  if (__kmp_tasking_mode != tskm_immediate_exec) {
1187  KMP_DEBUG_ASSERT(
1188  this_thr->th.th_task_team ==
1189  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1190  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1191  NULL);
1192  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1193  "team %p, new task_team = NULL\n",
1194  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1195  this_thr->th.th_task_team = NULL;
1196  }
1197 
1198 #if OMP_40_ENABLED
1199  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1200  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1201  proc_bind = proc_bind_false;
1202  } else if (proc_bind == proc_bind_default) {
1203  // No proc_bind clause was specified, so use the current value
1204  // of proc-bind-var for this parallel region.
1205  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1206  }
1207  // Reset for next parallel region
1208  this_thr->th.th_set_proc_bind = proc_bind_default;
1209 #endif /* OMP_40_ENABLED */
1210 
1211 #if OMPT_SUPPORT
1212  ompt_data_t ompt_parallel_data;
1213  ompt_parallel_data.ptr = NULL;
1214  ompt_data_t *implicit_task_data;
1215  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1216  if (ompt_enabled.enabled &&
1217  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1218 
1219  ompt_task_info_t *parent_task_info;
1220  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1221 
1222  parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1223  if (ompt_enabled.ompt_callback_parallel_begin) {
1224  int team_size = 1;
1225 
1226  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1227  &(parent_task_info->task_data), &(parent_task_info->frame),
1228  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1229  codeptr);
1230  }
1231  }
1232 #endif // OMPT_SUPPORT
1233 
1234  if (this_thr->th.th_team != serial_team) {
1235  // Nested level will be an index in the nested nthreads array
1236  int level = this_thr->th.th_team->t.t_level;
1237 
1238  if (serial_team->t.t_serialized) {
1239  /* this serial team was already used
1240  TODO: increase performance by making these locks more specific */
1241  kmp_team_t *new_team;
1242 
1243  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1244 
1245  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1246 #if OMPT_SUPPORT
1247  ompt_parallel_data,
1248 #endif
1249 #if OMP_40_ENABLED
1250  proc_bind,
1251 #endif
1252  &this_thr->th.th_current_task->td_icvs,
1253  0 USE_NESTED_HOT_ARG(NULL));
1254  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1255  KMP_ASSERT(new_team);
1256 
1257  /* setup new serialized team and install it */
1258  new_team->t.t_threads[0] = this_thr;
1259  new_team->t.t_parent = this_thr->th.th_team;
1260  serial_team = new_team;
1261  this_thr->th.th_serial_team = serial_team;
1262 
1263  KF_TRACE(
1264  10,
1265  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1266  global_tid, serial_team));
1267 
1268  /* TODO the above breaks the requirement that if we run out of resources,
1269  then we can still guarantee that serialized teams are ok, since we may
1270  need to allocate a new one */
1271  } else {
1272  KF_TRACE(
1273  10,
1274  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1275  global_tid, serial_team));
1276  }
1277 
1278  /* we have to initialize this serial team */
1279  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1280  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1281  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1282  serial_team->t.t_ident = loc;
1283  serial_team->t.t_serialized = 1;
1284  serial_team->t.t_nproc = 1;
1285  serial_team->t.t_parent = this_thr->th.th_team;
1286  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1287  this_thr->th.th_team = serial_team;
1288  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1289 
1290  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1291  this_thr->th.th_current_task));
1292  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1293  this_thr->th.th_current_task->td_flags.executing = 0;
1294 
1295  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1296 
1297  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1298  implicit task for each serialized task represented by
1299  team->t.t_serialized? */
1300  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1301  &this_thr->th.th_current_task->td_parent->td_icvs);
1302 
1303  // Thread value exists in the nested nthreads array for the next nested
1304  // level
1305  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1306  this_thr->th.th_current_task->td_icvs.nproc =
1307  __kmp_nested_nth.nth[level + 1];
1308  }
1309 
1310 #if OMP_40_ENABLED
1311  if (__kmp_nested_proc_bind.used &&
1312  (level + 1 < __kmp_nested_proc_bind.used)) {
1313  this_thr->th.th_current_task->td_icvs.proc_bind =
1314  __kmp_nested_proc_bind.bind_types[level + 1];
1315  }
1316 #endif /* OMP_40_ENABLED */
1317 
1318 #if USE_DEBUGGER
1319  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1320 #endif
1321  this_thr->th.th_info.ds.ds_tid = 0;
1322 
1323  /* set thread cache values */
1324  this_thr->th.th_team_nproc = 1;
1325  this_thr->th.th_team_master = this_thr;
1326  this_thr->th.th_team_serialized = 1;
1327 
1328  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1329  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1330 #if OMP_50_ENABLED
1331  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1332 #endif
1333 
1334  propagateFPControl(serial_team);
1335 
1336  /* check if we need to allocate dispatch buffers stack */
1337  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1338  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1339  serial_team->t.t_dispatch->th_disp_buffer =
1340  (dispatch_private_info_t *)__kmp_allocate(
1341  sizeof(dispatch_private_info_t));
1342  }
1343  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1344 
1345  KMP_MB();
1346 
1347  } else {
1348  /* this serialized team is already being used,
1349  * that's fine, just add another nested level */
1350  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1351  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1352  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1353  ++serial_team->t.t_serialized;
1354  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1355 
1356  // Nested level will be an index in the nested nthreads array
1357  int level = this_thr->th.th_team->t.t_level;
1358  // Thread value exists in the nested nthreads array for the next nested
1359  // level
1360  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1361  this_thr->th.th_current_task->td_icvs.nproc =
1362  __kmp_nested_nth.nth[level + 1];
1363  }
1364  serial_team->t.t_level++;
1365  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1366  "of serial team %p to %d\n",
1367  global_tid, serial_team, serial_team->t.t_level));
1368 
1369  /* allocate/push dispatch buffers stack */
1370  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1371  {
1372  dispatch_private_info_t *disp_buffer =
1373  (dispatch_private_info_t *)__kmp_allocate(
1374  sizeof(dispatch_private_info_t));
1375  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1376  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1377  }
1378  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1379 
1380  KMP_MB();
1381  }
1382 #if OMP_40_ENABLED
1383  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1384 #endif
1385 
1386  if (__kmp_env_consistency_check)
1387  __kmp_push_parallel(global_tid, NULL);
1388 #if OMPT_SUPPORT
1389  serial_team->t.ompt_team_info.master_return_address = codeptr;
1390  if (ompt_enabled.enabled &&
1391  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1392  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1393 
1394  ompt_lw_taskteam_t lw_taskteam;
1395  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1396  &ompt_parallel_data, codeptr);
1397 
1398  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1399  // don't use lw_taskteam after linking; its content was swapped
1400 
1401  /* OMPT implicit task begin */
1402  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1403  if (ompt_enabled.ompt_callback_implicit_task) {
1404  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1405  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1406  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1407  OMPT_CUR_TASK_INFO(this_thr)
1408  ->thread_num = __kmp_tid_from_gtid(global_tid);
1409  }
1410 
1411  /* OMPT state */
1412  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1413  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1414  }
1415 #endif
1416 }
1417 
1418 /* most of the work for a fork */
1419 /* return true if we really went parallel, false if serialized */
1420 int __kmp_fork_call(ident_t *loc, int gtid,
1421  enum fork_context_e call_context, // Intel, GNU, ...
1422  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1423 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1424 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1425  va_list *ap
1426 #else
1427  va_list ap
1428 #endif
1429  ) {
1430  void **argv;
1431  int i;
1432  int master_tid;
1433  int master_this_cons;
1434  kmp_team_t *team;
1435  kmp_team_t *parent_team;
1436  kmp_info_t *master_th;
1437  kmp_root_t *root;
1438  int nthreads;
1439  int master_active;
1440  int master_set_numthreads;
1441  int level;
1442 #if OMP_40_ENABLED
1443  int active_level;
1444  int teams_level;
1445 #endif
1446 #if KMP_NESTED_HOT_TEAMS
1447  kmp_hot_team_ptr_t **p_hot_teams;
1448 #endif
1449  { // KMP_TIME_BLOCK
1450  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1451  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1452 
1453  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1454  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1455  /* Some systems prefer the stack for the root thread(s) to start with */
1456  /* some gap from the parent stack to prevent false sharing. */
1457  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1458  /* These 2 lines below are so this does not get optimized out */
1459  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1460  __kmp_stkpadding += (short)((kmp_int64)dummy);
1461  }
1462 
1463  /* initialize if needed */
1464  KMP_DEBUG_ASSERT(
1465  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1466  if (!TCR_4(__kmp_init_parallel))
1467  __kmp_parallel_initialize();
1468 
1469  /* setup current data */
1470  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1471  // shutdown
1472  parent_team = master_th->th.th_team;
1473  master_tid = master_th->th.th_info.ds.ds_tid;
1474  master_this_cons = master_th->th.th_local.this_construct;
1475  root = master_th->th.th_root;
1476  master_active = root->r.r_active;
1477  master_set_numthreads = master_th->th.th_set_nproc;
1478 
1479 #if OMPT_SUPPORT
1480  ompt_data_t ompt_parallel_data;
1481  ompt_parallel_data.ptr = NULL;
1482  ompt_data_t *parent_task_data;
1483  omp_frame_t *ompt_frame;
1484  ompt_data_t *implicit_task_data;
1485  void *return_address = NULL;
1486 
1487  if (ompt_enabled.enabled) {
1488  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1489  NULL, NULL);
1490  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1491  }
1492 #endif
1493 
1494  // Nested level will be an index in the nested nthreads array
1495  level = parent_team->t.t_level;
1496  // used to launch non-serial teams even if nested is not allowed
1497  active_level = parent_team->t.t_active_level;
1498 #if OMP_40_ENABLED
1499  // needed to check nesting inside the teams
1500  teams_level = master_th->th.th_teams_level;
1501 #endif
1502 #if KMP_NESTED_HOT_TEAMS
1503  p_hot_teams = &master_th->th.th_hot_teams;
1504  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1505  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1506  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1507  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1508  // it is either actual or not needed (when active_level > 0)
1509  (*p_hot_teams)[0].hot_team_nth = 1;
1510  }
1511 #endif
1512 
1513 #if OMPT_SUPPORT
1514  if (ompt_enabled.enabled) {
1515  if (ompt_enabled.ompt_callback_parallel_begin) {
1516  int team_size = master_set_numthreads
1517  ? master_set_numthreads
1518  : get__nproc_2(parent_team, master_tid);
1519  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1520  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1521  OMPT_INVOKER(call_context), return_address);
1522  }
1523  master_th->th.ompt_thread_info.state = omp_state_overhead;
1524  }
1525 #endif
1526 
1527  master_th->th.th_ident = loc;
1528 
1529 #if OMP_40_ENABLED
1530  if (master_th->th.th_teams_microtask && ap &&
1531  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1532  // AC: This is the start of a parallel region nested inside a teams construct.
1533  // The team is actual (hot), all workers are ready at the fork barrier.
1534  // No lock needed to initialize the team a bit, then free workers.
1535  parent_team->t.t_ident = loc;
1536  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1537  parent_team->t.t_argc = argc;
1538  argv = (void **)parent_team->t.t_argv;
1539  for (i = argc - 1; i >= 0; --i)
1540 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1541 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1542  *argv++ = va_arg(*ap, void *);
1543 #else
1544  *argv++ = va_arg(ap, void *);
1545 #endif
1546  // Increment our nested depth levels, but do not increase the serialization
1547  if (parent_team == master_th->th.th_serial_team) {
1548  // AC: we are in serialized parallel
1549  __kmpc_serialized_parallel(loc, gtid);
1550  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1551  // AC: need this in order for enquiry functions to work
1552  // correctly; will restore at join time
1553  parent_team->t.t_serialized--;
1554 #if OMPT_SUPPORT
1555  void *dummy;
1556  void **exit_runtime_p;
1557 
1558  ompt_lw_taskteam_t lw_taskteam;
1559 
1560  if (ompt_enabled.enabled) {
1561  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1562  &ompt_parallel_data, return_address);
1563  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1564 
1565  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1566  // Don't use lw_taskteam after linking; its content was swapped
1567 
1568  /* OMPT implicit task begin */
1569  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1570  if (ompt_enabled.ompt_callback_implicit_task) {
1571  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1572  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1573  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1574  OMPT_CUR_TASK_INFO(master_th)
1575  ->thread_num = __kmp_tid_from_gtid(gtid);
1576  }
1577 
1578  /* OMPT state */
1579  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1580  } else {
1581  exit_runtime_p = &dummy;
1582  }
1583 #endif
1584 
1585  {
1586  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1587  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1588  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1589 #if OMPT_SUPPORT
1590  ,
1591  exit_runtime_p
1592 #endif
1593  );
1594  }
1595 
1596 #if OMPT_SUPPORT
1597  *exit_runtime_p = NULL;
1598  if (ompt_enabled.enabled) {
1599  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1600  if (ompt_enabled.ompt_callback_implicit_task) {
1601  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1602  ompt_scope_end, NULL, implicit_task_data, 1,
1603  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1604  }
1605  __ompt_lw_taskteam_unlink(master_th);
1606 
1607  if (ompt_enabled.ompt_callback_parallel_end) {
1608  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1609  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1610  OMPT_INVOKER(call_context), return_address);
1611  }
1612  master_th->th.ompt_thread_info.state = omp_state_overhead;
1613  }
1614 #endif
1615  return TRUE;
1616  }
1617 
1618  parent_team->t.t_pkfn = microtask;
1619  parent_team->t.t_invoke = invoker;
1620  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1621  parent_team->t.t_active_level++;
1622  parent_team->t.t_level++;
1623 #if OMP_50_ENABLED
1624  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1625 #endif
1626 
1627  /* Change number of threads in the team if requested */
1628  if (master_set_numthreads) { // The parallel has num_threads clause
1629  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1630  // AC: the number of threads can only be reduced dynamically, not increased
1631  kmp_info_t **other_threads = parent_team->t.t_threads;
1632  parent_team->t.t_nproc = master_set_numthreads;
1633  for (i = 0; i < master_set_numthreads; ++i) {
1634  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1635  }
1636  // Keep extra threads hot in the team for possible next parallels
1637  }
1638  master_th->th.th_set_nproc = 0;
1639  }
1640 
1641 #if USE_DEBUGGER
1642  if (__kmp_debugging) { // Let debugger override number of threads.
1643  int nth = __kmp_omp_num_threads(loc);
1644  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1645  master_set_numthreads = nth;
1646  }
1647  }
1648 #endif
1649 
1650  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1651  "master_th=%p, gtid=%d\n",
1652  root, parent_team, master_th, gtid));
1653  __kmp_internal_fork(loc, gtid, parent_team);
1654  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1655  "master_th=%p, gtid=%d\n",
1656  root, parent_team, master_th, gtid));
1657 
1658  /* Invoke microtask for MASTER thread */
1659  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1660  parent_team->t.t_id, parent_team->t.t_pkfn));
1661 
1662  if (!parent_team->t.t_invoke(gtid)) {
1663  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1664  }
1665  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1666  parent_team->t.t_id, parent_team->t.t_pkfn));
1667  KMP_MB(); /* Flush all pending memory write invalidates. */
1668 
1669  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1670 
1671  return TRUE;
1672  } // Parallel closely nested in teams construct
1673 #endif /* OMP_40_ENABLED */
1674 
1675 #if KMP_DEBUG
1676  if (__kmp_tasking_mode != tskm_immediate_exec) {
1677  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1678  parent_team->t.t_task_team[master_th->th.th_task_state]);
1679  }
1680 #endif
1681 
1682  if (parent_team->t.t_active_level >=
1683  master_th->th.th_current_task->td_icvs.max_active_levels) {
1684  nthreads = 1;
1685  } else {
1686 #if OMP_40_ENABLED
1687  int enter_teams = ((ap == NULL && active_level == 0) ||
1688  (ap && teams_level > 0 && teams_level == level));
1689 #endif
1690  nthreads =
1691  master_set_numthreads
1692  ? master_set_numthreads
1693  : get__nproc_2(
1694  parent_team,
1695  master_tid); // TODO: get nproc directly from current task
1696 
1697  // Check whether we need to take the forkjoin lock (no need for a serialized
1698  // parallel outside of a teams construct). This code was moved here from
1699  // __kmp_reserve_threads() to speed up nested serialized parallel regions.
1700  if (nthreads > 1) {
1701  if ((!get__nested(master_th) && (root->r.r_in_parallel
1702 #if OMP_40_ENABLED
1703  && !enter_teams
1704 #endif /* OMP_40_ENABLED */
1705  )) ||
1706  (__kmp_library == library_serial)) {
1707  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1708  " threads\n",
1709  gtid, nthreads));
1710  nthreads = 1;
1711  }
1712  }
1713  if (nthreads > 1) {
1714  /* determine how many new threads we can use */
1715  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1716  nthreads = __kmp_reserve_threads(
1717  root, parent_team, master_tid, nthreads
1718 #if OMP_40_ENABLED
1719  /* AC: If we execute teams from parallel region (on host), then
1720  teams should be created but each can only have 1 thread if
1721  nesting is disabled. If teams called from serial region, then
1722  teams and their threads should be created regardless of the
1723  nesting setting. */
1724  ,
1725  enter_teams
1726 #endif /* OMP_40_ENABLED */
1727  );
1728  if (nthreads == 1) {
1729  // Free lock for single thread execution here; for multi-thread
1730  // execution it will be freed later after team of threads created
1731  // and initialized
1732  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1733  }
1734  }
1735  }
1736  KMP_DEBUG_ASSERT(nthreads > 0);
1737 
1738  // If we temporarily changed the set number of threads then restore it now
1739  master_th->th.th_set_nproc = 0;
1740 
1741  /* create a serialized parallel region? */
1742  if (nthreads == 1) {
1743 /* josh todo: hypothetical question: what do we do for OS X*? */
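/* Make a private copy of the microtask arguments on the stack: a VLA on the
   Linux targets listed below, KMP_ALLOCA elsewhere. */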
1744 #if KMP_OS_LINUX && \
1745  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1746  void *args[argc];
1747 #else
1748  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1749 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1750  KMP_ARCH_AARCH64) */
1751 
1752  KA_TRACE(20,
1753  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1754 
1755  __kmpc_serialized_parallel(loc, gtid);
1756 
1757  if (call_context == fork_context_intel) {
1758  /* TODO this sucks, use the compiler itself to pass args! :) */
1759  master_th->th.th_serial_team->t.t_ident = loc;
1760 #if OMP_40_ENABLED
1761  if (!ap) {
1762  // revert change made in __kmpc_serialized_parallel()
1763  master_th->th.th_serial_team->t.t_level--;
1764 // Get args from parent team for teams construct
1765 
1766 #if OMPT_SUPPORT
1767  void *dummy;
1768  void **exit_runtime_p;
1769  ompt_task_info_t *task_info;
1770 
1771  ompt_lw_taskteam_t lw_taskteam;
1772 
1773  if (ompt_enabled.enabled) {
1774  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1775  &ompt_parallel_data, return_address);
1776 
1777  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1778  // Don't use lw_taskteam after linking; its content was swapped
1779 
1780  task_info = OMPT_CUR_TASK_INFO(master_th);
1781  exit_runtime_p = &(task_info->frame.exit_frame);
1782  if (ompt_enabled.ompt_callback_implicit_task) {
1783  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1784  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1785  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1786  OMPT_CUR_TASK_INFO(master_th)
1787  ->thread_num = __kmp_tid_from_gtid(gtid);
1788  }
1789 
1790  /* OMPT state */
1791  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1792  } else {
1793  exit_runtime_p = &dummy;
1794  }
1795 #endif
1796 
1797  {
1798  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1799  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1800  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1801  parent_team->t.t_argv
1802 #if OMPT_SUPPORT
1803  ,
1804  exit_runtime_p
1805 #endif
1806  );
1807  }
1808 
1809 #if OMPT_SUPPORT
1810  if (ompt_enabled.enabled) {
1811  *exit_runtime_p = NULL; // clear the implicit task's exit frame
1812  if (ompt_enabled.ompt_callback_implicit_task) {
1813  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1814  ompt_scope_end, NULL, &(task_info->task_data), 1,
1815  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1816  }
1817 
1818  __ompt_lw_taskteam_unlink(master_th);
1819  if (ompt_enabled.ompt_callback_parallel_end) {
1820  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1821  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1822  OMPT_INVOKER(call_context), return_address);
1823  }
1824  master_th->th.ompt_thread_info.state = omp_state_overhead;
1825  }
1826 #endif
1827  } else if (microtask == (microtask_t)__kmp_teams_master) {
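  // Serialized outer "parallel" of a teams construct: reuse the master's
  // serial team as the team object, gather the arguments, and call the
  // special teams invoker directly.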
1828  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1829  master_th->th.th_serial_team);
1830  team = master_th->th.th_team;
1831  // team->t.t_pkfn = microtask;
1832  team->t.t_invoke = invoker;
1833  __kmp_alloc_argv_entries(argc, team, TRUE);
1834  team->t.t_argc = argc;
1835  argv = (void **)team->t.t_argv;
1836  if (ap) {
1837  for (i = argc - 1; i >= 0; --i)
1838 // TODO: revert workaround for Intel(R) 64 tracker #96
1839 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1840  *argv++ = va_arg(*ap, void *);
1841 #else
1842  *argv++ = va_arg(ap, void *);
1843 #endif
1844  } else {
1845  for (i = 0; i < argc; ++i)
1846  // Get args from parent team for teams construct
1847  argv[i] = parent_team->t.t_argv[i];
1848  }
1849  // AC: revert change made in __kmpc_serialized_parallel()
1850  // because initial code in teams should have level=0
1851  team->t.t_level--;
1852  // AC: call special invoker for outer "parallel" of teams construct
1853  invoker(gtid);
1854  } else {
1855 #endif /* OMP_40_ENABLED */
1856  argv = args;
1857  for (i = argc - 1; i >= 0; --i)
1858 // TODO: revert workaround for Intel(R) 64 tracker #96
1859 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1860  *argv++ = va_arg(*ap, void *);
1861 #else
1862  *argv++ = va_arg(ap, void *);
1863 #endif
1864  KMP_MB();
1865 
1866 #if OMPT_SUPPORT
1867  void *dummy;
1868  void **exit_runtime_p;
1869  ompt_task_info_t *task_info;
1870 
1871  ompt_lw_taskteam_t lw_taskteam;
1872 
1873  if (ompt_enabled.enabled) {
1874  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1875  &ompt_parallel_data, return_address);
1876  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1877  // Don't use lw_taskteam after linking; its content was swapped
1878  task_info = OMPT_CUR_TASK_INFO(master_th);
1879  exit_runtime_p = &(task_info->frame.exit_frame);
1880 
1881  /* OMPT implicit task begin */
1882  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1883  if (ompt_enabled.ompt_callback_implicit_task) {
1884  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1885  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1886  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1887  OMPT_CUR_TASK_INFO(master_th)
1888  ->thread_num = __kmp_tid_from_gtid(gtid);
1889  }
1890 
1891  /* OMPT state */
1892  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1893  } else {
1894  exit_runtime_p = &dummy;
1895  }
1896 #endif
1897 
1898  {
1899  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903  ,
1904  exit_runtime_p
1905 #endif
1906  );
1907  }
1908 
1909 #if OMPT_SUPPORT
1910  if (ompt_enabled.enabled) {
1911  *exit_runtime_p = NULL;
1912  if (ompt_enabled.ompt_callback_implicit_task) {
1913  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914  ompt_scope_end, NULL, &(task_info->task_data), 1,
1915  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1916  }
1917 
1918  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1919  __ompt_lw_taskteam_unlink(master_th);
1920  if (ompt_enabled.ompt_callback_parallel_end) {
1921  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1922  &ompt_parallel_data, parent_task_data,
1923  OMPT_INVOKER(call_context), return_address);
1924  }
1925  master_th->th.ompt_thread_info.state = omp_state_overhead;
1926  }
1927 #endif
1928 #if OMP_40_ENABLED
1929  }
1930 #endif /* OMP_40_ENABLED */
1931  } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933  ompt_lw_taskteam_t lwt;
1934  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935  return_address);
1936 
1937  lwt.ompt_task_info.frame.exit_frame = NULL;
1938  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1939 // Don't use lw_taskteam after linking; its content was swapped
1940 #endif
1941 
1942  // we were called from GNU native code
1943  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944  return FALSE;
1945  } else {
1946  KMP_ASSERT2(call_context < fork_context_last,
1947  "__kmp_fork_call: unknown fork_context parameter");
1948  }
1949 
1950  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951  KMP_MB();
1952  return FALSE;
1953  }
1954 
1955  // GEH: only modify the executing flag in the non-serialized case;
1956  // the serialized case is handled in __kmpc_serialized_parallel
1957  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958  "curtask=%p, curtask_max_aclevel=%d\n",
1959  parent_team->t.t_active_level, master_th,
1960  master_th->th.th_current_task,
1961  master_th->th.th_current_task->td_icvs.max_active_levels));
1962  // TODO: GEH - cannot do this assertion because root thread not set up as
1963  // executing
1964  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965  master_th->th.th_current_task->td_flags.executing = 0;
1966 
1967 #if OMP_40_ENABLED
1968  if (!master_th->th.th_teams_microtask || level > teams_level)
1969 #endif /* OMP_40_ENABLED */
1970  {
1971  /* Increment our nested depth level */
1972  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1973  }
1974 
1975  // See if we need to make a copy of the ICVs.
1976  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1977  if ((level + 1 < __kmp_nested_nth.used) &&
1978  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1979  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1980  } else {
1981  nthreads_icv = 0; // don't update
1982  }
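  // Illustrative (sketch, not from the original source): with
  // OMP_NUM_THREADS=8,4 a top-level parallel region finds
  // __kmp_nested_nth.nth[1] == 4 here, so the new team's threads get an nproc
  // ICV of 4 and regions nested inside them default to 4 threads.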
1983 
1984 #if OMP_40_ENABLED
1985  // Figure out the proc_bind_policy for the new team.
1986  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1987  kmp_proc_bind_t proc_bind_icv =
1988  proc_bind_default; // proc_bind_default means don't update
1989  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1990  proc_bind = proc_bind_false;
1991  } else {
1992  if (proc_bind == proc_bind_default) {
1993  // No proc_bind clause specified; use current proc-bind-var for this
1994  // parallel region
1995  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1996  }
1997  /* else: The proc_bind policy was specified explicitly on parallel clause.
1998  This overrides proc-bind-var for this parallel region, but does not
1999  change proc-bind-var. */
2000  // Figure the value of proc-bind-var for the child threads.
2001  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2002  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2003  master_th->th.th_current_task->td_icvs.proc_bind)) {
2004  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2005  }
2006  }
2007 
2008  // Reset for next parallel region
2009  master_th->th.th_set_proc_bind = proc_bind_default;
2010 #endif /* OMP_40_ENABLED */
2011 
2012  if ((nthreads_icv > 0)
2013 #if OMP_40_ENABLED
2014  || (proc_bind_icv != proc_bind_default)
2015 #endif /* OMP_40_ENABLED */
2016  ) {
2017  kmp_internal_control_t new_icvs;
2018  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2019  new_icvs.next = NULL;
2020  if (nthreads_icv > 0) {
2021  new_icvs.nproc = nthreads_icv;
2022  }
2023 
2024 #if OMP_40_ENABLED
2025  if (proc_bind_icv != proc_bind_default) {
2026  new_icvs.proc_bind = proc_bind_icv;
2027  }
2028 #endif /* OMP_40_ENABLED */
2029 
2030  /* allocate a new parallel team */
2031  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2032  team = __kmp_allocate_team(root, nthreads, nthreads,
2033 #if OMPT_SUPPORT
2034  ompt_parallel_data,
2035 #endif
2036 #if OMP_40_ENABLED
2037  proc_bind,
2038 #endif
2039  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2040  } else {
2041  /* allocate a new parallel team */
2042  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2043  team = __kmp_allocate_team(root, nthreads, nthreads,
2044 #if OMPT_SUPPORT
2045  ompt_parallel_data,
2046 #endif
2047 #if OMP_40_ENABLED
2048  proc_bind,
2049 #endif
2050  &master_th->th.th_current_task->td_icvs,
2051  argc USE_NESTED_HOT_ARG(master_th));
2052  }
2053  KF_TRACE(
2054  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2055 
2056  /* setup the new team */
2057  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2058  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2059  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2060  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2061  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2062 #if OMPT_SUPPORT
2063  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2064  return_address);
2065 #endif
2066  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2067 // TODO: parent_team->t.t_level == INT_MAX ???
2068 #if OMP_40_ENABLED
2069  if (!master_th->th.th_teams_microtask || level > teams_level) {
2070 #endif /* OMP_40_ENABLED */
2071  int new_level = parent_team->t.t_level + 1;
2072  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2073  new_level = parent_team->t.t_active_level + 1;
2074  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2075 #if OMP_40_ENABLED
2076  } else {
2077  // AC: Do not increase parallel level at start of the teams construct
2078  int new_level = parent_team->t.t_level;
2079  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2080  new_level = parent_team->t.t_active_level;
2081  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2082  }
2083 #endif /* OMP_40_ENABLED */
2084  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2085  // set master's schedule as new run-time schedule
2086  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2087 
2088 #if OMP_40_ENABLED
2089  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2090 #endif
2091 #if OMP_50_ENABLED
2092  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2093 #endif
2094 
2095  // Update the floating point rounding in the team if required.
2096  propagateFPControl(team);
2097 
2098  if (__kmp_tasking_mode != tskm_immediate_exec) {
2099  // Set master's task team to team's task team. Unless this is hot team, it
2100  // should be NULL.
2101  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2102  parent_team->t.t_task_team[master_th->th.th_task_state]);
2103  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2104  "%p, new task_team %p / team %p\n",
2105  __kmp_gtid_from_thread(master_th),
2106  master_th->th.th_task_team, parent_team,
2107  team->t.t_task_team[master_th->th.th_task_state], team));
2108 
2109  if (active_level || master_th->th.th_task_team) {
2110  // Save the master's task_state on the memo stack
2111  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2112  if (master_th->th.th_task_state_top >=
2113  master_th->th.th_task_state_stack_sz) { // increase size
2114  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2115  kmp_uint8 *old_stack, *new_stack;
2116  kmp_uint32 i;
2117  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2118  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2119  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2120  }
2121  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2122  ++i) { // zero-init rest of stack
2123  new_stack[i] = 0;
2124  }
2125  old_stack = master_th->th.th_task_state_memo_stack;
2126  master_th->th.th_task_state_memo_stack = new_stack;
2127  master_th->th.th_task_state_stack_sz = new_size;
2128  __kmp_free(old_stack);
2129  }
2130  // Store master's task_state on stack
2131  master_th->th
2132  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2133  master_th->th.th_task_state;
2134  master_th->th.th_task_state_top++;
2135 #if KMP_NESTED_HOT_TEAMS
2136  if (master_th->th.th_hot_teams &&
2137  team == master_th->th.th_hot_teams[active_level].hot_team) {
2138  // Restore master's nested state if nested hot team
2139  master_th->th.th_task_state =
2140  master_th->th
2141  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2142  } else {
2143 #endif
2144  master_th->th.th_task_state = 0;
2145 #if KMP_NESTED_HOT_TEAMS
2146  }
2147 #endif
2148  }
2149 #if !KMP_NESTED_HOT_TEAMS
2150  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2151  (team == root->r.r_hot_team));
2152 #endif
2153  }
2154 
2155  KA_TRACE(
2156  20,
2157  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2158  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2159  team->t.t_nproc));
2160  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2161  (team->t.t_master_tid == 0 &&
2162  (team->t.t_parent == root->r.r_root_team ||
2163  team->t.t_parent->t.t_serialized)));
2164  KMP_MB();
2165 
2166  /* now, setup the arguments */
2167  argv = (void **)team->t.t_argv;
2168 #if OMP_40_ENABLED
2169  if (ap) {
2170 #endif /* OMP_40_ENABLED */
2171  for (i = argc - 1; i >= 0; --i) {
2172 // TODO: revert workaround for Intel(R) 64 tracker #96
2173 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2174  void *new_argv = va_arg(*ap, void *);
2175 #else
2176  void *new_argv = va_arg(ap, void *);
2177 #endif
2178  KMP_CHECK_UPDATE(*argv, new_argv);
2179  argv++;
2180  }
2181 #if OMP_40_ENABLED
2182  } else {
2183  for (i = 0; i < argc; ++i) {
2184  // Get args from parent team for teams construct
2185  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2186  }
2187  }
2188 #endif /* OMP_40_ENABLED */
2189 
2190  /* now actually fork the threads */
2191  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2192  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2193  root->r.r_active = TRUE;
2194 
2195  __kmp_fork_team_threads(root, team, master_th, gtid);
2196  __kmp_setup_icv_copy(team, nthreads,
2197  &master_th->th.th_current_task->td_icvs, loc);
2198 
2199 #if OMPT_SUPPORT
2200  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2201 #endif
2202 
2203  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2204 
2205 #if USE_ITT_BUILD
2206  if (team->t.t_active_level == 1 // only report frames at level 1
2207 #if OMP_40_ENABLED
2208  && !master_th->th.th_teams_microtask // not in teams construct
2209 #endif /* OMP_40_ENABLED */
2210  ) {
2211 #if USE_ITT_NOTIFY
2212  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213  (__kmp_forkjoin_frames_mode == 3 ||
2214  __kmp_forkjoin_frames_mode == 1)) {
2215  kmp_uint64 tmp_time = 0;
2216  if (__itt_get_timestamp_ptr)
2217  tmp_time = __itt_get_timestamp();
2218  // Internal fork - report frame begin
2219  master_th->th.th_frame_time = tmp_time;
2220  if (__kmp_forkjoin_frames_mode == 3)
2221  team->t.t_region_time = tmp_time;
2222  } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229  }
2230  }
2231 #endif /* USE_ITT_BUILD */
2232 
2233  /* now go on and do the work */
2234  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235  KMP_MB();
2236  KF_TRACE(10,
2237  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238  root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241  if (__itt_stack_caller_create_ptr) {
2242  team->t.t_stack_id =
2243  __kmp_itt_stack_caller_create(); // create new stack stitching id
2244  // before entering fork barrier
2245  }
2246 #endif /* USE_ITT_BUILD */
2247 
2248 #if OMP_40_ENABLED
2249  // AC: skip __kmp_internal_fork for the teams construct; let only the
2250  // master threads execute
2251  if (ap)
2252 #endif /* OMP_40_ENABLED */
2253  {
2254  __kmp_internal_fork(loc, gtid, team);
2255  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2256  "master_th=%p, gtid=%d\n",
2257  root, team, master_th, gtid));
2258  }
2259 
2260  if (call_context == fork_context_gnu) {
2261  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2262  return TRUE;
2263  }
2264 
2265  /* Invoke microtask for MASTER thread */
2266  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2267  team->t.t_id, team->t.t_pkfn));
2268  } // END of timer KMP_fork_call block
2269 
2270  if (!team->t.t_invoke(gtid)) {
2271  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2272  }
2273  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2274  team->t.t_id, team->t.t_pkfn));
2275  KMP_MB(); /* Flush all pending memory write invalidates. */
2276 
2277  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2278 
2279 #if OMPT_SUPPORT
2280  if (ompt_enabled.enabled) {
2281  master_th->th.ompt_thread_info.state = omp_state_overhead;
2282  }
2283 #endif
2284 
2285  return TRUE;
2286 }
2287 
2288 #if OMPT_SUPPORT
2289 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2290  kmp_team_t *team) {
2291  // restore state outside the region
2292  thread->th.ompt_thread_info.state =
2293  ((team->t.t_serialized) ? omp_state_work_serial
2294  : omp_state_work_parallel);
2295 }
2296 
2297 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2298  kmp_team_t *team, ompt_data_t *parallel_data,
2299  fork_context_e fork_context, void *codeptr) {
2300  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2301  if (ompt_enabled.ompt_callback_parallel_end) {
2302  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2303  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2304  codeptr);
2305  }
2306 
2307  task_info->frame.enter_frame = NULL;
2308  __kmp_join_restore_state(thread, team);
2309 }
2310 #endif
2311 
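/* Run by the master thread at the end of a parallel region: waits at the join
   barrier (unless exiting a teams construct), emits OMPT/ITT end events,
   releases or recycles the team, and restores the master's state in the
   parent team. */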
2312 void __kmp_join_call(ident_t *loc, int gtid
2313 #if OMPT_SUPPORT
2314  ,
2315  enum fork_context_e fork_context
2316 #endif
2317 #if OMP_40_ENABLED
2318  ,
2319  int exit_teams
2320 #endif /* OMP_40_ENABLED */
2321  ) {
2322  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2323  kmp_team_t *team;
2324  kmp_team_t *parent_team;
2325  kmp_info_t *master_th;
2326  kmp_root_t *root;
2327  int master_active;
2328  int i;
2329 
2330  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2331 
2332  /* setup current data */
2333  master_th = __kmp_threads[gtid];
2334  root = master_th->th.th_root;
2335  team = master_th->th.th_team;
2336  parent_team = team->t.t_parent;
2337 
2338  master_th->th.th_ident = loc;
2339 
2340 #if OMPT_SUPPORT
2341  if (ompt_enabled.enabled) {
2342  master_th->th.ompt_thread_info.state = omp_state_overhead;
2343  }
2344 #endif
2345 
2346 #if KMP_DEBUG
2347  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2348  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2349  "th_task_team = %p\n",
2350  __kmp_gtid_from_thread(master_th), team,
2351  team->t.t_task_team[master_th->th.th_task_state],
2352  master_th->th.th_task_team));
2353  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2354  team->t.t_task_team[master_th->th.th_task_state]);
2355  }
2356 #endif
2357 
2358  if (team->t.t_serialized) {
2359 #if OMP_40_ENABLED
2360  if (master_th->th.th_teams_microtask) {
2361  // We are in teams construct
2362  int level = team->t.t_level;
2363  int tlevel = master_th->th.th_teams_level;
2364  if (level == tlevel) {
2365  // AC: we haven't incremented it earlier at start of teams construct,
2366  // so do it here - at the end of teams construct
2367  team->t.t_level++;
2368  } else if (level == tlevel + 1) {
2369  // AC: we are exiting parallel inside teams, need to increment
2370  // serialization in order to restore it in the next call to
2371  // __kmpc_end_serialized_parallel
2372  team->t.t_serialized++;
2373  }
2374  }
2375 #endif /* OMP_40_ENABLED */
2376  __kmpc_end_serialized_parallel(loc, gtid);
2377 
2378 #if OMPT_SUPPORT
2379  if (ompt_enabled.enabled) {
2380  __kmp_join_restore_state(master_th, parent_team);
2381  }
2382 #endif
2383 
2384  return;
2385  }
2386 
2387  master_active = team->t.t_master_active;
2388 
2389 #if OMP_40_ENABLED
2390  if (!exit_teams)
2391 #endif /* OMP_40_ENABLED */
2392  {
2393  // AC: No barrier for the inner teams at exit from the teams construct,
2394  // but there is a barrier for the outer team (the league).
2395  __kmp_internal_join(loc, gtid, team);
2396  }
2397 #if OMP_40_ENABLED
2398  else {
2399  master_th->th.th_task_state =
2400  0; // AC: no tasking in teams (out of any parallel)
2401  }
2402 #endif /* OMP_40_ENABLED */
2403 
2404  KMP_MB();
2405 
2406 #if OMPT_SUPPORT
2407  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2408  void *codeptr = team->t.ompt_team_info.master_return_address;
2409 #endif
2410 
2411 #if USE_ITT_BUILD
2412  if (__itt_stack_caller_create_ptr) {
2413  __kmp_itt_stack_caller_destroy(
2414  (__itt_caller)team->t
2415  .t_stack_id); // destroy the stack stitching id after join barrier
2416  }
2417 
2418  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2419  if (team->t.t_active_level == 1
2420 #if OMP_40_ENABLED
2421  && !master_th->th.th_teams_microtask /* not in teams construct */
2422 #endif /* OMP_40_ENABLED */
2423  ) {
2424  master_th->th.th_ident = loc;
2425  // only one notification scheme (either "submit" or "forking/joined", not
2426  // both)
2427  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2428  __kmp_forkjoin_frames_mode == 3)
2429  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2430  master_th->th.th_frame_time, 0, loc,
2431  master_th->th.th_team_nproc, 1);
2432  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2433  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2434  __kmp_itt_region_joined(gtid);
2435  } // active_level == 1
2436 #endif /* USE_ITT_BUILD */
2437 
2438 #if OMP_40_ENABLED
2439  if (master_th->th.th_teams_microtask && !exit_teams &&
2440  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2441  team->t.t_level == master_th->th.th_teams_level + 1) {
2442  // AC: We need to leave the team structure intact at the end of a parallel
2443  // region inside the teams construct, so that the next parallel region reuses
2444  // the same (hot) team; only adjust the nesting levels.
2445 
2446  /* Decrement our nested depth level */
2447  team->t.t_level--;
2448  team->t.t_active_level--;
2449  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2450 
2451  /* Restore number of threads in the team if needed */
2452  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2453  int old_num = master_th->th.th_team_nproc;
2454  int new_num = master_th->th.th_teams_size.nth;
2455  kmp_info_t **other_threads = team->t.t_threads;
2456  team->t.t_nproc = new_num;
2457  for (i = 0; i < old_num; ++i) {
2458  other_threads[i]->th.th_team_nproc = new_num;
2459  }
2460  // Adjust the state of the team's currently unused threads
2461  for (i = old_num; i < new_num; ++i) {
2462  // Re-initialize thread's barrier data.
2463  int b;
2464  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2465  for (b = 0; b < bs_last_barrier; ++b) {
2466  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2467  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2468 #if USE_DEBUGGER
2469  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2470 #endif
2471  }
2472  if (__kmp_tasking_mode != tskm_immediate_exec) {
2473  // Synchronize thread's task state
2474  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2475  }
2476  }
2477  }
2478 
2479 #if OMPT_SUPPORT
2480  if (ompt_enabled.enabled) {
2481  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2482  codeptr);
2483  }
2484 #endif
2485 
2486  return;
2487  }
2488 #endif /* OMP_40_ENABLED */
2489 
2490  /* do cleanup and restore the parent team */
2491  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2492  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2493 
2494  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2495 
2496  /* jc: The following lock has instructions with REL and ACQ semantics,
2497  separating the parallel user code called in this parallel region
2498  from the serial user code called after this function returns. */
2499  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2500 
2501 #if OMP_40_ENABLED
2502  if (!master_th->th.th_teams_microtask ||
2503  team->t.t_level > master_th->th.th_teams_level)
2504 #endif /* OMP_40_ENABLED */
2505  {
2506  /* Decrement our nested depth level */
2507  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2508  }
2509  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2510 
2511 #if OMPT_SUPPORT
2512  if (ompt_enabled.enabled) {
2513  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2514  if (ompt_enabled.ompt_callback_implicit_task) {
2515  int ompt_team_size = team->t.t_nproc;
2516  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2517  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2518  OMPT_CUR_TASK_INFO(master_th)->thread_num);
2519  }
2520 
2521  task_info->frame.exit_frame = NULL;
2522  task_info->task_data = ompt_data_none;
2523  }
2524 #endif
2525 
2526  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2527  master_th, team));
2528  __kmp_pop_current_task_from_thread(master_th);
2529 
2530 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2531  // Restore master thread's partition.
2532  master_th->th.th_first_place = team->t.t_first_place;
2533  master_th->th.th_last_place = team->t.t_last_place;
2534 #endif /* OMP_40_ENABLED */
2535 #if OMP_50_ENABLED
2536  master_th->th.th_def_allocator = team->t.t_def_allocator;
2537 #endif
2538 
2539  updateHWFPControl(team);
2540 
2541  if (root->r.r_active != master_active)
2542  root->r.r_active = master_active;
2543 
2544  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2545  master_th)); // this will free worker threads
2546 
2547  /* This race was fun to find. Make sure the following is in the critical
2548  region, otherwise assertions may fail occasionally since the old team may be
2549  reallocated and the hierarchy appears inconsistent. It is actually safe to
2550  run and won't cause any bugs, but it will cause those assertion failures. It's
2551  only one deref & assign, so we might as well keep it in the critical region. */
2552  master_th->th.th_team = parent_team;
2553  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2554  master_th->th.th_team_master = parent_team->t.t_threads[0];
2555  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2556 
2557  /* restore serialized team, if need be */
2558  if (parent_team->t.t_serialized &&
2559  parent_team != master_th->th.th_serial_team &&
2560  parent_team != root->r.r_root_team) {
2561  __kmp_free_team(root,
2562  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2563  master_th->th.th_serial_team = parent_team;
2564  }
2565 
2566  if (__kmp_tasking_mode != tskm_immediate_exec) {
2567  if (master_th->th.th_task_state_top >
2568  0) { // Restore task state from memo stack
2569  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2570  // Remember master's state if we re-use this nested hot team
2571  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2572  master_th->th.th_task_state;
2573  --master_th->th.th_task_state_top; // pop
2574  // Now restore state at this level
2575  master_th->th.th_task_state =
2576  master_th->th
2577  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2578  }
2579  // Copy the task team from the parent team to the master thread
2580  master_th->th.th_task_team =
2581  parent_team->t.t_task_team[master_th->th.th_task_state];
2582  KA_TRACE(20,
2583  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2584  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2585  parent_team));
2586  }
2587 
2588  // TODO: GEH - cannot do this assertion because root thread not set up as
2589  // executing
2590  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2591  master_th->th.th_current_task->td_flags.executing = 1;
2592 
2593  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2594 
2595 #if OMPT_SUPPORT
2596  if (ompt_enabled.enabled) {
2597  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2598  codeptr);
2599  }
2600 #endif
2601 
2602  KMP_MB();
2603  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2604 }
2605 
2606 /* Check whether we should push an internal control record onto the
2607  serial team stack. If so, do it. */
2608 void __kmp_save_internal_controls(kmp_info_t *thread) {
2609 
2610  if (thread->th.th_team != thread->th.th_serial_team) {
2611  return;
2612  }
2613  if (thread->th.th_team->t.t_serialized > 1) {
2614  int push = 0;
2615 
2616  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2617  push = 1;
2618  } else {
2619  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2620  thread->th.th_team->t.t_serialized) {
2621  push = 1;
2622  }
2623  }
2624  if (push) { /* push a record on the serial team's stack */
2625  kmp_internal_control_t *control =
2626  (kmp_internal_control_t *)__kmp_allocate(
2627  sizeof(kmp_internal_control_t));
2628 
2629  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2630 
2631  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2632 
2633  control->next = thread->th.th_team->t.t_control_stack_top;
2634  thread->th.th_team->t.t_control_stack_top = control;
2635  }
2636  }
2637 }
2638 
2639 /* Changes set_nproc */
2640 void __kmp_set_num_threads(int new_nth, int gtid) {
2641  kmp_info_t *thread;
2642  kmp_root_t *root;
2643 
2644  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2645  KMP_DEBUG_ASSERT(__kmp_init_serial);
2646 
2647  if (new_nth < 1)
2648  new_nth = 1;
2649  else if (new_nth > __kmp_max_nth)
2650  new_nth = __kmp_max_nth;
2651 
2652  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2653  thread = __kmp_threads[gtid];
2654 
2655  __kmp_save_internal_controls(thread);
2656 
2657  set__nproc(thread, new_nth);
2658 
2659  // If this omp_set_num_threads() call will cause the hot team size to be
2660  // reduced (in the absence of a num_threads clause), then reduce it now,
2661  // rather than waiting for the next parallel region.
2662  root = thread->th.th_root;
2663  if (__kmp_init_parallel && (!root->r.r_active) &&
2664  (root->r.r_hot_team->t.t_nproc > new_nth)
2665 #if KMP_NESTED_HOT_TEAMS
2666  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2667 #endif
2668  ) {
2669  kmp_team_t *hot_team = root->r.r_hot_team;
2670  int f;
2671 
2672  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2673 
2674  // Release the extra threads we don't need any more.
2675  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2676  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2677  if (__kmp_tasking_mode != tskm_immediate_exec) {
2678  // When decreasing the team size, threads no longer in the team should
2679  // unreference the task team.
2680  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2681  }
2682  __kmp_free_thread(hot_team->t.t_threads[f]);
2683  hot_team->t.t_threads[f] = NULL;
2684  }
2685  hot_team->t.t_nproc = new_nth;
2686 #if KMP_NESTED_HOT_TEAMS
2687  if (thread->th.th_hot_teams) {
2688  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2689  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2690  }
2691 #endif
2692 
2693  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2694 
2695  // Update the t_nproc field in the threads that are still active.
2696  for (f = 0; f < new_nth; f++) {
2697  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2698  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2699  }
2700  // Special flag value recording that the size changed via omp_set_num_threads()
2701  hot_team->t.t_size_changed = -1;
2702  }
2703 }
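// Illustrative call path (sketch, not part of this file): a user call such as
//   omp_set_num_threads(4);
// reaches __kmp_set_num_threads(4, gtid) through the libomp API entry point,
// so the next #pragma omp parallel without a num_threads clause forks at most
// 4 threads.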
2704 
2705 /* Changes max_active_levels */
2706 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2707  kmp_info_t *thread;
2708 
2709  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2710  "%d = (%d)\n",
2711  gtid, max_active_levels));
2712  KMP_DEBUG_ASSERT(__kmp_init_serial);
2713 
2714  // validate max_active_levels
2715  if (max_active_levels < 0) {
2716  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2717  // We ignore this call if the user has specified a negative value.
2718  // The current setting won't be changed. The last valid setting will be
2719  // used. A warning will be issued (if warnings are allowed as controlled by
2720  // the KMP_WARNINGS env var).
2721  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2722  "max_active_levels for thread %d = (%d)\n",
2723  gtid, max_active_levels));
2724  return;
2725  }
2726  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2727  // it's OK, the max_active_levels is within the valid range: [ 0;
2728  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2729  // We allow a zero value. (implementation defined behavior)
2730  } else {
2731  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2732  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2733  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2734  // Current upper limit is MAX_INT. (implementation defined behavior)
2735  // If the input exceeds the upper limit, we correct the input to be the
2736  // upper limit. (implementation defined behavior)
2737  // Actually, the flow should never get here until we use MAX_INT limit.
2738  }
2739  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2740  "max_active_levels for thread %d = (%d)\n",
2741  gtid, max_active_levels));
2742 
2743  thread = __kmp_threads[gtid];
2744 
2745  __kmp_save_internal_controls(thread);
2746 
2747  set__max_active_levels(thread, max_active_levels);
2748 }
2749 
2750 /* Gets max_active_levels */
2751 int __kmp_get_max_active_levels(int gtid) {
2752  kmp_info_t *thread;
2753 
2754  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2755  KMP_DEBUG_ASSERT(__kmp_init_serial);
2756 
2757  thread = __kmp_threads[gtid];
2758  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2759  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2760  "curtask_maxaclevel=%d\n",
2761  gtid, thread->th.th_current_task,
2762  thread->th.th_current_task->td_icvs.max_active_levels));
2763  return thread->th.th_current_task->td_icvs.max_active_levels;
2764 }
2765 
2766 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2767 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2768  kmp_info_t *thread;
2769  // kmp_team_t *team;
2770 
2771  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2772  gtid, (int)kind, chunk));
2773  KMP_DEBUG_ASSERT(__kmp_init_serial);
2774 
2775  // Check if the kind parameter is valid, correct if needed.
2776  // Valid parameters should fit in one of two intervals - standard or extended:
2777  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2778  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2779  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2780  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2781  // TODO: Hint needs attention in case we change the default schedule.
2782  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2783  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2784  __kmp_msg_null);
2785  kind = kmp_sched_default;
2786  chunk = 0; // ignore chunk value in case of bad kind
2787  }
2788 
2789  thread = __kmp_threads[gtid];
2790 
2791  __kmp_save_internal_controls(thread);
2792 
2793  if (kind < kmp_sched_upper_std) {
2794  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2795  // distinguish static chunked vs. unchunked: an invalid chunk value
2796  // indicates the unchunked schedule (which is the default)
2797  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2798  } else {
2799  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2800  __kmp_sch_map[kind - kmp_sched_lower - 1];
2801  }
2802  } else {
2803  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2804  // kmp_sched_lower - 2 ];
2805  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2806  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2807  kmp_sched_lower - 2];
2808  }
2809  if (kind == kmp_sched_auto || chunk < 1) {
2810  // ignore parameter chunk for schedule auto
2811  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2812  } else {
2813  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2814  }
2815 }
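// Illustrative (the exact API mapping is an assumption): a user call
//   omp_set_schedule(omp_sched_dynamic, 4);
// is expected to arrive here as kind == kmp_sched_dynamic, chunk == 4, and
// updates the run-time schedule ICV used by schedule(runtime) loops.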
2816 
2817 /* Gets def_sched_var ICV values */
2818 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2819  kmp_info_t *thread;
2820  enum sched_type th_type;
2821 
2822  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2823  KMP_DEBUG_ASSERT(__kmp_init_serial);
2824 
2825  thread = __kmp_threads[gtid];
2826 
2827  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2828 
2829  switch (th_type) {
2830  case kmp_sch_static:
2831  case kmp_sch_static_greedy:
2832  case kmp_sch_static_balanced:
2833  *kind = kmp_sched_static;
2834  *chunk = 0; // chunk was not set; indicate this with a zero value
2835  return;
2836  case kmp_sch_static_chunked:
2837  *kind = kmp_sched_static;
2838  break;
2839  case kmp_sch_dynamic_chunked:
2840  *kind = kmp_sched_dynamic;
2841  break;
2842  case kmp_sch_guided_chunked:
2843  case kmp_sch_guided_iterative_chunked:
2844  case kmp_sch_guided_analytical_chunked:
2845  *kind = kmp_sched_guided;
2846  break;
2847  case kmp_sch_auto:
2848  *kind = kmp_sched_auto;
2849  break;
2850  case kmp_sch_trapezoidal:
2851  *kind = kmp_sched_trapezoidal;
2852  break;
2853 #if KMP_STATIC_STEAL_ENABLED
2854  case kmp_sch_static_steal:
2855  *kind = kmp_sched_static_steal;
2856  break;
2857 #endif
2858  default:
2859  KMP_FATAL(UnknownSchedulingType, th_type);
2860  }
2861 
2862  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2863 }
2864 
2865 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2866 
2867  int ii, dd;
2868  kmp_team_t *team;
2869  kmp_info_t *thr;
2870 
2871  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2872  KMP_DEBUG_ASSERT(__kmp_init_serial);
2873 
2874  // validate level
2875  if (level == 0)
2876  return 0;
2877  if (level < 0)
2878  return -1;
2879  thr = __kmp_threads[gtid];
2880  team = thr->th.th_team;
2881  ii = team->t.t_level;
2882  if (level > ii)
2883  return -1;
2884 
2885 #if OMP_40_ENABLED
2886  if (thr->th.th_teams_microtask) {
2887  // AC: we are in teams region where multiple nested teams have same level
2888  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2889  if (level <=
2890  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2891  KMP_DEBUG_ASSERT(ii >= tlevel);
2892  // AC: Since we have to step over the teams league, artificially
2893  // increase ii
2894  if (ii == tlevel) {
2895  ii += 2; // three teams have same level
2896  } else {
2897  ii++; // two teams have same level
2898  }
2899  }
2900  }
2901 #endif
2902 
2903  if (ii == level)
2904  return __kmp_tid_from_gtid(gtid);
2905 
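  // Walk up the team tree toward the requested level, consuming the
  // serialized nesting levels recorded in each team (t_serialized) before
  // stepping to the parent team.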
2906  dd = team->t.t_serialized;
2907  level++;
2908  while (ii > level) {
2909  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2910  }
2911  if ((team->t.t_serialized) && (!dd)) {
2912  team = team->t.t_parent;
2913  continue;
2914  }
2915  if (ii > level) {
2916  team = team->t.t_parent;
2917  dd = team->t.t_serialized;
2918  ii--;
2919  }
2920  }
2921 
2922  return (dd > 1) ? (0) : (team->t.t_master_tid);
2923 }
2924 
2925 int __kmp_get_team_size(int gtid, int level) {
2926 
2927  int ii, dd;
2928  kmp_team_t *team;
2929  kmp_info_t *thr;
2930 
2931  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2932  KMP_DEBUG_ASSERT(__kmp_init_serial);
2933 
2934  // validate level
2935  if (level == 0)
2936  return 1;
2937  if (level < 0)
2938  return -1;
2939  thr = __kmp_threads[gtid];
2940  team = thr->th.th_team;
2941  ii = team->t.t_level;
2942  if (level > ii)
2943  return -1;
2944 
2945 #if OMP_40_ENABLED
2946  if (thr->th.th_teams_microtask) {
2947  // AC: we are in teams region where multiple nested teams have same level
2948  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2949  if (level <=
2950  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2951  KMP_DEBUG_ASSERT(ii >= tlevel);
2952  // AC: Since we have to step over the teams league, artificially
2953  // increase ii
2954  if (ii == tlevel) {
2955  ii += 2; // three teams have same level
2956  } else {
2957  ii++; // two teams have same level
2958  }
2959  }
2960  }
2961 #endif
2962 
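  // Same walk as in __kmp_get_ancestor_thread_num: consume the serialized
  // levels within each team, then step to the parent team.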
2963  while (ii > level) {
2964  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2965  }
2966  if (team->t.t_serialized && (!dd)) {
2967  team = team->t.t_parent;
2968  continue;
2969  }
2970  if (ii > level) {
2971  team = team->t.t_parent;
2972  ii--;
2973  }
2974  }
2975 
2976  return team->t.t_nproc;
2977 }
2978 
2979 kmp_r_sched_t __kmp_get_schedule_global() {
2980  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
2981  // (__kmp_static, __kmp_guided) may be changed independently by
2982  // kmp_set_defaults, so the up-to-date schedule can be obtained here.
2983 
2984  kmp_r_sched_t r_sched;
2985 
2986  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2987  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2988  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2989  // different roots (even in OMP 2.5)
2990  if (__kmp_sched == kmp_sch_static) {
2991  // replace STATIC with more detailed schedule (balanced or greedy)
2992  r_sched.r_sched_type = __kmp_static;
2993  } else if (__kmp_sched == kmp_sch_guided_chunked) {
2994  // replace GUIDED with more detailed schedule (iterative or analytical)
2995  r_sched.r_sched_type = __kmp_guided;
2996  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2997  r_sched.r_sched_type = __kmp_sched;
2998  }
2999 
3000  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3001  // __kmp_chunk may be wrong here (if it was not ever set)
3002  r_sched.chunk = KMP_DEFAULT_CHUNK;
3003  } else {
3004  r_sched.chunk = __kmp_chunk;
3005  }
3006 
3007  return r_sched;
3008 }
3009 
3010 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3011  at least argc *t_argv entries for the requested team. */
3012 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3013 
3014  KMP_DEBUG_ASSERT(team);
3015  if (!realloc || argc > team->t.t_max_argc) {
3016 
3017  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3018  "current entries=%d\n",
3019  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3020  /* if previously allocated heap space for args, free them */
3021  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3022  __kmp_free((void *)team->t.t_argv);
3023 
3024  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3025  /* use unused space in the cache line for arguments */
3026  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3027  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3028  "argv entries\n",
3029  team->t.t_id, team->t.t_max_argc));
3030  team->t.t_argv = &team->t.t_inline_argv[0];
3031  if (__kmp_storage_map) {
3032  __kmp_print_storage_map_gtid(
3033  -1, &team->t.t_inline_argv[0],
3034  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3035  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3036  team->t.t_id);
3037  }
3038  } else {
3039  /* allocate space for arguments in the heap */
3040  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3041  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3042  : 2 * argc;
3043  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3044  "argv entries\n",
3045  team->t.t_id, team->t.t_max_argc));
3046  team->t.t_argv =
3047  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3048  if (__kmp_storage_map) {
3049  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3050  &team->t.t_argv[team->t.t_max_argc],
3051  sizeof(void *) * team->t.t_max_argc,
3052  "team_%d.t_argv", team->t.t_id);
3053  }
3054  }
3055  }
3056 }
3057 
3058 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3059  int i;
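  // A team that can never have more than one thread only needs two dispatch
  // buffers; larger teams get the configured number of buffers.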
3060  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3061  team->t.t_threads =
3062  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3063  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3064  sizeof(dispatch_shared_info_t) * num_disp_buff);
3065  team->t.t_dispatch =
3066  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3067  team->t.t_implicit_task_taskdata =
3068  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3069  team->t.t_max_nproc = max_nth;
3070 
3071  /* setup dispatch buffers */
3072  for (i = 0; i < num_disp_buff; ++i) {
3073  team->t.t_disp_buffer[i].buffer_index = i;
3074 #if OMP_45_ENABLED
3075  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3076 #endif
3077  }
3078 }
3079 
3080 static void __kmp_free_team_arrays(kmp_team_t *team) {
3081  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3082  int i;
3083  for (i = 0; i < team->t.t_max_nproc; ++i) {
3084  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3085  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3086  team->t.t_dispatch[i].th_disp_buffer = NULL;
3087  }
3088  }
3089 #if KMP_USE_HIER_SCHED
3090  __kmp_dispatch_free_hierarchies(team);
3091 #endif
3092  __kmp_free(team->t.t_threads);
3093  __kmp_free(team->t.t_disp_buffer);
3094  __kmp_free(team->t.t_dispatch);
3095  __kmp_free(team->t.t_implicit_task_taskdata);
3096  team->t.t_threads = NULL;
3097  team->t.t_disp_buffer = NULL;
3098  team->t.t_dispatch = NULL;
3099  team->t.t_implicit_task_taskdata = 0;
3100 }
3101 
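/* Grow the team's per-thread arrays to max_nth entries: the existing
   t_threads pointers are preserved, the other arrays are reallocated fresh. */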
3102 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3103  kmp_info_t **oldThreads = team->t.t_threads;
3104 
3105  __kmp_free(team->t.t_disp_buffer);
3106  __kmp_free(team->t.t_dispatch);
3107  __kmp_free(team->t.t_implicit_task_taskdata);
3108  __kmp_allocate_team_arrays(team, max_nth);
3109 
3110  KMP_MEMCPY(team->t.t_threads, oldThreads,
3111  team->t.t_nproc * sizeof(kmp_info_t *));
3112 
3113  __kmp_free(oldThreads);
3114 }
3115 
3116 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3117 
3118  kmp_r_sched_t r_sched =
3119  __kmp_get_schedule_global(); // get current state of scheduling globals
3120 
3121 #if OMP_40_ENABLED
3122  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3123 #endif /* OMP_40_ENABLED */
3124 
3125  kmp_internal_control_t g_icvs = {
3126  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3127  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3128  // for nested parallelism (per thread)
3129  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3130  // adjustment of threads (per thread)
3131  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3132  // whether blocktime is explicitly set
3133  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3134 #if KMP_USE_MONITOR
3135  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3136 // intervals
3137 #endif
3138  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3139  // next parallel region (per thread)
3140  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3141  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3142  // for max_active_levels
3143  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3144 // {sched,chunk} pair
3145 #if OMP_40_ENABLED
3146  __kmp_nested_proc_bind.bind_types[0],
3147  __kmp_default_device,
3148 #endif /* OMP_40_ENABLED */
3149  NULL // struct kmp_internal_control *next;
3150  };
3151 
3152  return g_icvs;
3153 }
3154 
3155 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3156 
3157  kmp_internal_control_t gx_icvs;
3158  gx_icvs.serial_nesting_level =
3159  0; // probably =team->t.t_serial like in save_inter_controls
3160  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3161  gx_icvs.next = NULL;
3162 
3163  return gx_icvs;
3164 }
3165 
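// One-time setup of a root structure: creates the root team (always
// serialized, single thread) and the hot team that is reused for parallel
// regions forked from this root.  The hot team is sized up front for
// __kmp_dflt_team_nth_ub * 2 workers but starts with t_nproc == 1.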
3166 static void __kmp_initialize_root(kmp_root_t *root) {
3167  int f;
3168  kmp_team_t *root_team;
3169  kmp_team_t *hot_team;
3170  int hot_team_max_nth;
3171  kmp_r_sched_t r_sched =
3172  __kmp_get_schedule_global(); // get current state of scheduling globals
3173  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3174  KMP_DEBUG_ASSERT(root);
3175  KMP_ASSERT(!root->r.r_begin);
3176 
3177  /* setup the root state structure */
3178  __kmp_init_lock(&root->r.r_begin_lock);
3179  root->r.r_begin = FALSE;
3180  root->r.r_active = FALSE;
3181  root->r.r_in_parallel = 0;
3182  root->r.r_blocktime = __kmp_dflt_blocktime;
3183  root->r.r_nested = __kmp_dflt_nested;
3184  root->r.r_cg_nthreads = 1;
3185 
3186  /* setup the root team for this task */
3187  /* allocate the root team structure */
3188  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3189 
3190  root_team =
3191  __kmp_allocate_team(root,
3192  1, // new_nproc
3193  1, // max_nproc
3194 #if OMPT_SUPPORT
3195  ompt_data_none, // root parallel id
3196 #endif
3197 #if OMP_40_ENABLED
3198  __kmp_nested_proc_bind.bind_types[0],
3199 #endif
3200  &r_icvs,
3201  0 // argc
3202  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3203  );
3204 #if USE_DEBUGGER
3205  // Non-NULL value should be assigned to make the debugger display the root
3206  // team.
3207  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3208 #endif
3209 
3210  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3211 
3212  root->r.r_root_team = root_team;
3213  root_team->t.t_control_stack_top = NULL;
3214 
3215  /* initialize root team */
3216  root_team->t.t_threads[0] = NULL;
3217  root_team->t.t_nproc = 1;
3218  root_team->t.t_serialized = 1;
3219  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3220  root_team->t.t_sched.sched = r_sched.sched;
3221  KA_TRACE(
3222  20,
3223  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3224  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3225 
3226  /* setup the hot team for this task */
3227  /* allocate the hot team structure */
3228  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3229 
3230  hot_team =
3231  __kmp_allocate_team(root,
3232  1, // new_nproc
3233  __kmp_dflt_team_nth_ub * 2, // max_nproc
3234 #if OMPT_SUPPORT
3235  ompt_data_none, // root parallel id
3236 #endif
3237 #if OMP_40_ENABLED
3238  __kmp_nested_proc_bind.bind_types[0],
3239 #endif
3240  &r_icvs,
3241  0 // argc
3242  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3243  );
3244  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3245 
3246  root->r.r_hot_team = hot_team;
3247  root_team->t.t_control_stack_top = NULL;
3248 
3249  /* first-time initialization */
3250  hot_team->t.t_parent = root_team;
3251 
3252  /* initialize hot team */
3253  hot_team_max_nth = hot_team->t.t_max_nproc;
3254  for (f = 0; f < hot_team_max_nth; ++f) {
3255  hot_team->t.t_threads[f] = NULL;
3256  }
3257  hot_team->t.t_nproc = 1;
3258  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3259  hot_team->t.t_sched.sched = r_sched.sched;
3260  hot_team->t.t_size_changed = 0;
3261 }
3262 
3263 #ifdef KMP_DEBUG
3264 
3265 typedef struct kmp_team_list_item {
3266  kmp_team_p const *entry;
3267  struct kmp_team_list_item *next;
3268 } kmp_team_list_item_t;
3269 typedef kmp_team_list_item_t *kmp_team_list_t;
3270 
3271 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3272  kmp_team_list_t list, // List of teams.
3273  kmp_team_p const *team // Team to add.
3274  ) {
3275 
3276  // List must terminate with item where both entry and next are NULL.
3277  // Team is added to the list only once.
3278  // List is sorted in ascending order by team id.
3279  // Team id is *not* a key.
3280 
3281  kmp_team_list_t l;
3282 
3283  KMP_DEBUG_ASSERT(list != NULL);
3284  if (team == NULL) {
3285  return;
3286  }
3287 
3288  __kmp_print_structure_team_accum(list, team->t.t_parent);
3289  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3290 
3291  // Search list for the team.
3292  l = list;
3293  while (l->next != NULL && l->entry != team) {
3294  l = l->next;
3295  }
3296  if (l->next != NULL) {
3297  return; // Team has been added before, exit.
3298  }
3299 
3300  // Team is not found. Search list again for insertion point.
3301  l = list;
3302  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3303  l = l->next;
3304  }
3305 
3306  // Insert team.
3307  {
3308  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3309  sizeof(kmp_team_list_item_t));
3310  *item = *l;
3311  l->entry = team;
3312  l->next = item;
3313  }
3314 }
3315 
3316 static void __kmp_print_structure_team(char const *title,
3317                                        kmp_team_p const *team) {
3319  __kmp_printf("%s", title);
3320  if (team != NULL) {
3321  __kmp_printf("%2x %p\n", team->t.t_id, team);
3322  } else {
3323  __kmp_printf(" - (nil)\n");
3324  }
3325 }
3326 
3327 static void __kmp_print_structure_thread(char const *title,
3328  kmp_info_p const *thread) {
3329  __kmp_printf("%s", title);
3330  if (thread != NULL) {
3331  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3332  } else {
3333  __kmp_printf(" - (nil)\n");
3334  }
3335 }
3336 
3337 void __kmp_print_structure(void) {
3338 
3339  kmp_team_list_t list;
3340 
3341  // Initialize list of teams.
3342  list =
3343  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3344  list->entry = NULL;
3345  list->next = NULL;
3346 
3347  __kmp_printf("\n------------------------------\nGlobal Thread "
3348  "Table\n------------------------------\n");
3349  {
3350  int gtid;
3351  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3352  __kmp_printf("%2d", gtid);
3353  if (__kmp_threads != NULL) {
3354  __kmp_printf(" %p", __kmp_threads[gtid]);
3355  }
3356  if (__kmp_root != NULL) {
3357  __kmp_printf(" %p", __kmp_root[gtid]);
3358  }
3359  __kmp_printf("\n");
3360  }
3361  }
3362 
3363  // Print out __kmp_threads array.
3364  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3365  "----------\n");
3366  if (__kmp_threads != NULL) {
3367  int gtid;
3368  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3369  kmp_info_t const *thread = __kmp_threads[gtid];
3370  if (thread != NULL) {
3371  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3372  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3373  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3374  __kmp_print_structure_team(" Serial Team: ",
3375  thread->th.th_serial_team);
3376  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3377  __kmp_print_structure_thread(" Master: ",
3378  thread->th.th_team_master);
3379  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3380  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3381 #if OMP_40_ENABLED
3382  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3383 #endif
3384  __kmp_print_structure_thread(" Next in pool: ",
3385  thread->th.th_next_pool);
3386  __kmp_printf("\n");
3387  __kmp_print_structure_team_accum(list, thread->th.th_team);
3388  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3389  }
3390  }
3391  } else {
3392  __kmp_printf("Threads array is not allocated.\n");
3393  }
3394 
3395  // Print out __kmp_root array.
3396  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3397  "--------\n");
3398  if (__kmp_root != NULL) {
3399  int gtid;
3400  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3401  kmp_root_t const *root = __kmp_root[gtid];
3402  if (root != NULL) {
3403  __kmp_printf("GTID %2d %p:\n", gtid, root);
3404  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3405  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3406  __kmp_print_structure_thread(" Uber Thread: ",
3407  root->r.r_uber_thread);
3408  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3409  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3410  __kmp_printf(" In Parallel: %2d\n",
3411  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3412  __kmp_printf("\n");
3413  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3414  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3415  }
3416  }
3417  } else {
3418  __kmp_printf("Ubers array is not allocated.\n");
3419  }
3420 
3421  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3422  "--------\n");
3423  while (list->next != NULL) {
3424  kmp_team_p const *team = list->entry;
3425  int i;
3426  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3427  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3428  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3429  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3430  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3431  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3432  for (i = 0; i < team->t.t_nproc; ++i) {
3433  __kmp_printf(" Thread %2d: ", i);
3434  __kmp_print_structure_thread("", team->t.t_threads[i]);
3435  }
3436  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3437  __kmp_printf("\n");
3438  list = list->next;
3439  }
3440 
3441  // Print out __kmp_thread_pool and __kmp_team_pool.
3442  __kmp_printf("\n------------------------------\nPools\n----------------------"
3443  "--------\n");
3444  __kmp_print_structure_thread("Thread pool: ",
3445  CCAST(kmp_info_t *, __kmp_thread_pool));
3446  __kmp_print_structure_team("Team pool: ",
3447  CCAST(kmp_team_t *, __kmp_team_pool));
3448  __kmp_printf("\n");
3449 
3450  // Free team list.
3451  while (list != NULL) {
3452  kmp_team_list_item_t *item = list;
3453  list = list->next;
3454  KMP_INTERNAL_FREE(item);
3455  }
3456 }
3457 
3458 #endif
3459 
3460 //---------------------------------------------------------------------------
3461 // Stuff for per-thread fast random number generator
3462 // Table of primes
3463 static const unsigned __kmp_primes[] = {
3464  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3465  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3466  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3467  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3468  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3469  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3470  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3471  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3472  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3473  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3474  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3475 
3476 //---------------------------------------------------------------------------
3477 // __kmp_get_random: Get a random number using a linear congruential method.
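// The update is x_{n+1} = a * x_n + 1 (mod 2^32 via unsigned wrap-around)
// with an odd per-thread multiplier 'a'.  The low-order bits of such an LCG
// have very short periods (bit 0 merely alternates), so only the high 16 bits
// of th_x are returned.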
3478 unsigned short __kmp_get_random(kmp_info_t *thread) {
3479  unsigned x = thread->th.th_x;
3480  unsigned short r = x >> 16;
3481 
3482  thread->th.th_x = x * thread->th.th_a + 1;
3483 
3484  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3485  thread->th.th_info.ds.ds_tid, r));
3486 
3487  return r;
3488 }
3489 //--------------------------------------------------------
3490 // __kmp_init_random: Initialize a random number generator
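// Each thread picks its multiplier th_a from __kmp_primes, indexed by its
// tid, so different threads advance distinct LCG sequences and their random
// streams are largely decorrelated.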
3491 void __kmp_init_random(kmp_info_t *thread) {
3492  unsigned seed = thread->th.th_info.ds.ds_tid;
3493 
3494  thread->th.th_a =
3495  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3496  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3497  KA_TRACE(30,
3498  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3499 }
3500 
3501 #if KMP_OS_WINDOWS
3502 /* reclaim array entries for root threads that are already dead, returns number
3503  * reclaimed */
3504 static int __kmp_reclaim_dead_roots(void) {
3505  int i, r = 0;
3506 
3507  for (i = 0; i < __kmp_threads_capacity; ++i) {
3508  if (KMP_UBER_GTID(i) &&
3509  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3510  !__kmp_root[i]
3511  ->r.r_active) { // AC: reclaim only roots died in non-active state
3512  r += __kmp_unregister_root_other_thread(i);
3513  }
3514  }
3515  return r;
3516 }
3517 #endif
3518 
3519 /* This function attempts to create free entries in __kmp_threads and
3520  __kmp_root, and returns the number of free entries generated.
3521 
3522  For Windows* OS static library, the first mechanism used is to reclaim array
3523  entries for root threads that are already dead.
3524 
3525  On all platforms, expansion is attempted on the arrays __kmp_threads and
3526  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3527  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3528  threadprivate cache array has been created. Synchronization with
3529  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3530 
3531  After any dead root reclamation, if the clipping value allows array expansion
3532  to result in the generation of a total of nNeed free slots, the function does
3533  that expansion. If not, nothing is done beyond the possible initial root
3534  thread reclamation.
3535 
3536  If any argument is negative, the behavior is undefined. */
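/* Illustrative example (values assumed): with __kmp_threads_capacity == 64,
   nNeed == 3 and a large __kmp_sys_max_nth, newCapacity doubles once to 128,
   both arrays are copied into a single new allocation, and 64 newly created
   free slots are reported to the caller. */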
3537 static int __kmp_expand_threads(int nNeed) {
3538  int added = 0;
3539  int minimumRequiredCapacity;
3540  int newCapacity;
3541  kmp_info_t **newThreads;
3542  kmp_root_t **newRoot;
3543 
3544 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3545 // resizing __kmp_threads does not need additional protection if foreign
3546 // threads are present
3547 
3548 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3549  /* only for Windows static library */
3550  /* reclaim array entries for root threads that are already dead */
3551  added = __kmp_reclaim_dead_roots();
3552 
3553  if (nNeed) {
3554  nNeed -= added;
3555  if (nNeed < 0)
3556  nNeed = 0;
3557  }
3558 #endif
3559  if (nNeed <= 0)
3560  return added;
3561 
3562  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3563  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3564  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3565  // > __kmp_max_nth in one of two ways:
3566  //
3567  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3568  // may not be resused by another thread, so we may need to increase
3569  // may not be reused by another thread, so we may need to increase
3570  //
3571  // 2) New foreign root(s) are encountered. We always register new foreign
3572  // roots. This may cause a smaller # of threads to be allocated at
3573  // subsequent parallel regions, but the worker threads hang around (and
3574  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3575  //
3576  // Anyway, that is the reason for moving the check to see if
3577  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3578  // instead of having it performed here. -BB
3579 
3580  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3581 
3582  /* compute expansion headroom to check if we can expand */
3583  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3584  /* possible expansion too small -- give up */
3585  return added;
3586  }
3587  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3588 
3589  newCapacity = __kmp_threads_capacity;
3590  do {
3591  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3592  : __kmp_sys_max_nth;
3593  } while (newCapacity < minimumRequiredCapacity);
3594  newThreads = (kmp_info_t **)__kmp_allocate(
3595  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3596  newRoot =
3597  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3598  KMP_MEMCPY(newThreads, __kmp_threads,
3599  __kmp_threads_capacity * sizeof(kmp_info_t *));
3600  KMP_MEMCPY(newRoot, __kmp_root,
3601  __kmp_threads_capacity * sizeof(kmp_root_t *));
3602 
3603  kmp_info_t **temp_threads = __kmp_threads;
3604  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3605  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3606  __kmp_free(temp_threads);
3607  added += newCapacity - __kmp_threads_capacity;
3608  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3609 
3610  if (newCapacity > __kmp_tp_capacity) {
3611  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3612  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3613  __kmp_threadprivate_resize_cache(newCapacity);
3614  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3615  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3616  }
3617  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3618  }
3619 
3620  return added;
3621 }
3622 
3623 /* Register the current thread as a root thread and obtain our gtid. We must
3624  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3625  the thread that calls from __kmp_do_serial_initialize() */
3626 int __kmp_register_root(int initial_thread) {
3627  kmp_info_t *root_thread;
3628  kmp_root_t *root;
3629  int gtid;
3630  int capacity;
3631  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3632  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3633  KMP_MB();
3634 
3635  /* 2007-03-02:
3636  If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3637  initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3638  work as expected -- it may return false (that means there is at least one
3639  empty slot in __kmp_threads array), but it is possible the only free slot
3640  is #0, which is reserved for initial thread and so cannot be used for this
3641  one. The following code works around this bug.
3642 
3643  However, the right solution seems to be not to reserve slot #0 for the
3644  initial thread, because:
3645  (1) there is no magic in slot #0,
3646  (2) we cannot detect initial thread reliably (the first thread which does
3647  serial initialization may not be the real initial thread).
3648  */
3649  capacity = __kmp_threads_capacity;
3650  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3651  --capacity;
3652  }
3653 
3654  /* see if there are too many threads */
3655  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3656  if (__kmp_tp_cached) {
3657  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3658  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3659  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3660  } else {
3661  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3662  __kmp_msg_null);
3663  }
3664  }
3665 
3666  /* find an available thread slot */
3667  /* Don't reassign the zero slot since we need that to only be used by initial
3668  thread */
3669  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3670  gtid++)
3671  ;
3672  KA_TRACE(1,
3673  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3674  KMP_ASSERT(gtid < __kmp_threads_capacity);
3675 
3676  /* update global accounting */
3677  __kmp_all_nth++;
3678  TCW_4(__kmp_nth, __kmp_nth + 1);
3679 
3680  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3681  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3682  if (__kmp_adjust_gtid_mode) {
3683  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3684  if (TCR_4(__kmp_gtid_mode) != 2) {
3685  TCW_4(__kmp_gtid_mode, 2);
3686  }
3687  } else {
3688  if (TCR_4(__kmp_gtid_mode) != 1) {
3689  TCW_4(__kmp_gtid_mode, 1);
3690  }
3691  }
3692  }
3693 
3694 #ifdef KMP_ADJUST_BLOCKTIME
3695  /* Adjust blocktime to zero if necessary */
3696  /* Middle initialization might not have occurred yet */
3697  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3698  if (__kmp_nth > __kmp_avail_proc) {
3699  __kmp_zero_bt = TRUE;
3700  }
3701  }
3702 #endif /* KMP_ADJUST_BLOCKTIME */
3703 
3704  /* setup this new hierarchy */
3705  if (!(root = __kmp_root[gtid])) {
3706  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3707  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3708  }
3709 
3710 #if KMP_STATS_ENABLED
3711  // Initialize stats as soon as possible (right after gtid assignment).
3712  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3713  __kmp_stats_thread_ptr->startLife();
3714  KMP_SET_THREAD_STATE(SERIAL_REGION);
3715  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3716 #endif
3717  __kmp_initialize_root(root);
3718 
3719  /* setup new root thread structure */
3720  if (root->r.r_uber_thread) {
3721  root_thread = root->r.r_uber_thread;
3722  } else {
3723  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3724  if (__kmp_storage_map) {
3725  __kmp_print_thread_storage_map(root_thread, gtid);
3726  }
3727  root_thread->th.th_info.ds.ds_gtid = gtid;
3728 #if OMPT_SUPPORT
3729  root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3730 #endif
3731  root_thread->th.th_root = root;
3732  if (__kmp_env_consistency_check) {
3733  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3734  }
3735 #if USE_FAST_MEMORY
3736  __kmp_initialize_fast_memory(root_thread);
3737 #endif /* USE_FAST_MEMORY */
3738 
3739 #if KMP_USE_BGET
3740  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3741  __kmp_initialize_bget(root_thread);
3742 #endif
3743  __kmp_init_random(root_thread); // Initialize random number generator
3744  }
3745 
3746  /* setup the serial team held in reserve by the root thread */
3747  if (!root_thread->th.th_serial_team) {
3748  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3749  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3750  root_thread->th.th_serial_team =
3751  __kmp_allocate_team(root, 1, 1,
3752 #if OMPT_SUPPORT
3753  ompt_data_none, // root parallel id
3754 #endif
3755 #if OMP_40_ENABLED
3756  proc_bind_default,
3757 #endif
3758  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3759  }
3760  KMP_ASSERT(root_thread->th.th_serial_team);
3761  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3762  root_thread->th.th_serial_team));
3763 
3764  /* drop root_thread into place */
3765  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3766 
3767  root->r.r_root_team->t.t_threads[0] = root_thread;
3768  root->r.r_hot_team->t.t_threads[0] = root_thread;
3769  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3770  // AC: the team created in reserve, not for execution (it is unused for now).
3771  root_thread->th.th_serial_team->t.t_serialized = 0;
3772  root->r.r_uber_thread = root_thread;
3773 
3774  /* initialize the thread, get it ready to go */
3775  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3776  TCW_4(__kmp_init_gtid, TRUE);
3777 
3778  /* prepare the master thread for get_gtid() */
3779  __kmp_gtid_set_specific(gtid);
3780 
3781 #if USE_ITT_BUILD
3782  __kmp_itt_thread_name(gtid);
3783 #endif /* USE_ITT_BUILD */
3784 
3785 #ifdef KMP_TDATA_GTID
3786  __kmp_gtid = gtid;
3787 #endif
3788  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3789  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3790 
3791  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3792  "plain=%u\n",
3793  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3794  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3795  KMP_INIT_BARRIER_STATE));
3796  { // Initialize barrier data.
3797  int b;
3798  for (b = 0; b < bs_last_barrier; ++b) {
3799  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3800 #if USE_DEBUGGER
3801  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3802 #endif
3803  }
3804  }
3805  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3806  KMP_INIT_BARRIER_STATE);
3807 
3808 #if KMP_AFFINITY_SUPPORTED
3809 #if OMP_40_ENABLED
3810  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3811  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3812  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3813  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3814 #endif
3815  if (TCR_4(__kmp_init_middle)) {
3816  __kmp_affinity_set_init_mask(gtid, TRUE);
3817  }
3818 #endif /* KMP_AFFINITY_SUPPORTED */
3819 #if OMP_50_ENABLED
3820  root_thread->th.th_def_allocator = __kmp_def_allocator;
3821 #endif
3822 
3823  __kmp_root_counter++;
3824 
3825 #if OMPT_SUPPORT
3826  if (!initial_thread && ompt_enabled.enabled) {
3827 
3828  kmp_info_t *root_thread = ompt_get_thread();
3829 
3830  ompt_set_thread_state(root_thread, omp_state_overhead);
3831 
3832  if (ompt_enabled.ompt_callback_thread_begin) {
3833  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3834  ompt_thread_initial, __ompt_get_thread_data_internal());
3835  }
3836  ompt_data_t *task_data;
3837  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3838  if (ompt_enabled.ompt_callback_task_create) {
3839  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3840  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3841  // initial task has nothing to return to
3842  }
3843 
3844  ompt_set_thread_state(root_thread, omp_state_work_serial);
3845  }
3846 #endif
3847 
3848  KMP_MB();
3849  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3850 
3851  return gtid;
3852 }
3853 
3854 #if KMP_NESTED_HOT_TEAMS
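// Recursively release the nested hot teams attached to 'thr' starting at
// 'level'; returns the number of threads released.  The master of each hot
// team is not freed here (hence the "nth - 1" below).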
3855 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3856  const int max_level) {
3857  int i, n, nth;
3858  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3859  if (!hot_teams || !hot_teams[level].hot_team) {
3860  return 0;
3861  }
3862  KMP_DEBUG_ASSERT(level < max_level);
3863  kmp_team_t *team = hot_teams[level].hot_team;
3864  nth = hot_teams[level].hot_team_nth;
3865  n = nth - 1; // master is not freed
3866  if (level < max_level - 1) {
3867  for (i = 0; i < nth; ++i) {
3868  kmp_info_t *th = team->t.t_threads[i];
3869  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3870  if (i > 0 && th->th.th_hot_teams) {
3871  __kmp_free(th->th.th_hot_teams);
3872  th->th.th_hot_teams = NULL;
3873  }
3874  }
3875  }
3876  __kmp_free_team(root, team, NULL);
3877  return n;
3878 }
3879 #endif
3880 
3881 // Resets a root thread and clears its root and hot teams.
3882 // Returns the number of __kmp_threads entries directly and indirectly freed.
3883 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3884  kmp_team_t *root_team = root->r.r_root_team;
3885  kmp_team_t *hot_team = root->r.r_hot_team;
3886  int n = hot_team->t.t_nproc;
3887  int i;
3888 
3889  KMP_DEBUG_ASSERT(!root->r.r_active);
3890 
3891  root->r.r_root_team = NULL;
3892  root->r.r_hot_team = NULL;
3893  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3894  // before call to __kmp_free_team().
3895  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3896 #if KMP_NESTED_HOT_TEAMS
3897  if (__kmp_hot_teams_max_level >
3898  0) { // need to free nested hot teams and their threads if any
3899  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3900  kmp_info_t *th = hot_team->t.t_threads[i];
3901  if (__kmp_hot_teams_max_level > 1) {
3902  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3903  }
3904  if (th->th.th_hot_teams) {
3905  __kmp_free(th->th.th_hot_teams);
3906  th->th.th_hot_teams = NULL;
3907  }
3908  }
3909  }
3910 #endif
3911  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3912 
3913  // Before we can reap the thread, we need to make certain that all other
3914  // threads in the teams that had this root as ancestor have stopped trying to
3915  // steal tasks.
3916  if (__kmp_tasking_mode != tskm_immediate_exec) {
3917  __kmp_wait_to_unref_task_teams();
3918  }
3919 
3920 #if KMP_OS_WINDOWS
3921  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3922  KA_TRACE(
3923  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3924  "\n",
3925  (LPVOID) & (root->r.r_uber_thread->th),
3926  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3927  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3928 #endif /* KMP_OS_WINDOWS */
3929 
3930 #if OMPT_SUPPORT
3931  if (ompt_enabled.ompt_callback_thread_end) {
3932  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3933  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3934  }
3935 #endif
3936 
3937  TCW_4(__kmp_nth,
3938  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3939  root->r.r_cg_nthreads--;
3940 
3941  __kmp_reap_thread(root->r.r_uber_thread, 1);
3942 
3943  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3944  // it instead of freeing it.
3945  root->r.r_uber_thread = NULL;
3946  /* mark root as no longer in use */
3947  root->r.r_begin = FALSE;
3948 
3949  return n;
3950 }
3951 
3952 void __kmp_unregister_root_current_thread(int gtid) {
3953  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3954  /* this lock should be ok, since unregister_root_current_thread is never
3955  called during an abort, only during a normal close. furthermore, if you
3956  have the forkjoin lock, you should never try to get the initz lock */
3957  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3958  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3959  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3960  "exiting T#%d\n",
3961  gtid));
3962  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3963  return;
3964  }
3965  kmp_root_t *root = __kmp_root[gtid];
3966 
3967  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3968  KMP_ASSERT(KMP_UBER_GTID(gtid));
3969  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3970  KMP_ASSERT(root->r.r_active == FALSE);
3971 
3972  KMP_MB();
3973 
3974 #if OMP_45_ENABLED
3975  kmp_info_t *thread = __kmp_threads[gtid];
3976  kmp_team_t *team = thread->th.th_team;
3977  kmp_task_team_t *task_team = thread->th.th_task_team;
3978 
3979  // we need to wait for the proxy tasks before finishing the thread
3980  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3981 #if OMPT_SUPPORT
3982  // the runtime is shutting down so we won't report any events
3983  thread->th.ompt_thread_info.state = omp_state_undefined;
3984 #endif
3985  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3986  }
3987 #endif
3988 
3989  __kmp_reset_root(gtid, root);
3990 
3991  /* free up this thread slot */
3992  __kmp_gtid_set_specific(KMP_GTID_DNE);
3993 #ifdef KMP_TDATA_GTID
3994  __kmp_gtid = KMP_GTID_DNE;
3995 #endif
3996 
3997  KMP_MB();
3998  KC_TRACE(10,
3999  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4000 
4001  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4002 }
4003 
4004 #if KMP_OS_WINDOWS
4005 /* __kmp_forkjoin_lock must be already held
4006  Unregisters a root thread that is not the current thread. Returns the number
4007  of __kmp_threads entries freed as a result. */
4008 static int __kmp_unregister_root_other_thread(int gtid) {
4009  kmp_root_t *root = __kmp_root[gtid];
4010  int r;
4011 
4012  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4013  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4014  KMP_ASSERT(KMP_UBER_GTID(gtid));
4015  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4016  KMP_ASSERT(root->r.r_active == FALSE);
4017 
4018  r = __kmp_reset_root(gtid, root);
4019  KC_TRACE(10,
4020  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4021  return r;
4022 }
4023 #endif
4024 
4025 #if KMP_DEBUG
4026 void __kmp_task_info() {
4027 
4028  kmp_int32 gtid = __kmp_entry_gtid();
4029  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4030  kmp_info_t *this_thr = __kmp_threads[gtid];
4031  kmp_team_t *steam = this_thr->th.th_serial_team;
4032  kmp_team_t *team = this_thr->th.th_team;
4033 
4034  __kmp_printf(
4035  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4036  "ptask=%p\n",
4037  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4038  team->t.t_implicit_task_taskdata[tid].td_parent);
4039 }
4040 #endif // KMP_DEBUG
4041 
4042 /* TODO optimize with one big memclr, take out what isn't needed, split
4043  responsibility to workers as much as possible, and delay initialization of
4044  features as much as possible */
4045 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4046  int tid, int gtid) {
4047  /* this_thr->th.th_info.ds.ds_gtid is setup in
4048  kmp_allocate_thread/create_worker.
4049  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4050  kmp_info_t *master = team->t.t_threads[0];
4051  KMP_DEBUG_ASSERT(this_thr != NULL);
4052  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4053  KMP_DEBUG_ASSERT(team);
4054  KMP_DEBUG_ASSERT(team->t.t_threads);
4055  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4056  KMP_DEBUG_ASSERT(master);
4057  KMP_DEBUG_ASSERT(master->th.th_root);
4058 
4059  KMP_MB();
4060 
4061  TCW_SYNC_PTR(this_thr->th.th_team, team);
4062 
4063  this_thr->th.th_info.ds.ds_tid = tid;
4064  this_thr->th.th_set_nproc = 0;
4065  if (__kmp_tasking_mode != tskm_immediate_exec)
4066  // When tasking is possible, threads are not safe to reap until they are
4067  // done tasking; this will be set when tasking code is exited in wait
4068  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4069  else // no tasking --> always safe to reap
4070  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4071 #if OMP_40_ENABLED
4072  this_thr->th.th_set_proc_bind = proc_bind_default;
4073 #if KMP_AFFINITY_SUPPORTED
4074  this_thr->th.th_new_place = this_thr->th.th_current_place;
4075 #endif
4076 #endif
4077  this_thr->th.th_root = master->th.th_root;
4078 
4079  /* setup the thread's cache of the team structure */
4080  this_thr->th.th_team_nproc = team->t.t_nproc;
4081  this_thr->th.th_team_master = master;
4082  this_thr->th.th_team_serialized = team->t.t_serialized;
4083  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4084 
4085  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4086 
4087  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4088  tid, gtid, this_thr, this_thr->th.th_current_task));
4089 
4090  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4091  team, tid, TRUE);
4092 
4093  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4094  tid, gtid, this_thr, this_thr->th.th_current_task));
4095  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4096  // __kmp_initialize_team()?
4097 
4098  /* TODO no worksharing in speculative threads */
4099  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4100 
4101  this_thr->th.th_local.this_construct = 0;
4102 
4103  if (!this_thr->th.th_pri_common) {
4104  this_thr->th.th_pri_common =
4105  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4106  if (__kmp_storage_map) {
4107  __kmp_print_storage_map_gtid(
4108  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4109  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4110  }
4111  this_thr->th.th_pri_head = NULL;
4112  }
4113 
4114  /* Initialize dynamic dispatch */
4115  {
4116  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4117  // Use team max_nproc since this will never change for the team.
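// A serial team needs only a single private buffer; other teams allocate
// __kmp_dispatch_num_buffers of them so that consecutive dynamically
// scheduled loops can rotate through buffers (the rotation itself is handled
// by the dispatch code).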
4118  size_t disp_size =
4119  sizeof(dispatch_private_info_t) *
4120  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4121  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4122  team->t.t_max_nproc));
4123  KMP_ASSERT(dispatch);
4124  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4125  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4126 
4127  dispatch->th_disp_index = 0;
4128 #if OMP_45_ENABLED
4129  dispatch->th_doacross_buf_idx = 0;
4130 #endif
4131  if (!dispatch->th_disp_buffer) {
4132  dispatch->th_disp_buffer =
4133  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4134 
4135  if (__kmp_storage_map) {
4136  __kmp_print_storage_map_gtid(
4137  gtid, &dispatch->th_disp_buffer[0],
4138  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4139  ? 1
4140  : __kmp_dispatch_num_buffers],
4141  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4142  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4143  gtid, team->t.t_id, gtid);
4144  }
4145  } else {
4146  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4147  }
4148 
4149  dispatch->th_dispatch_pr_current = 0;
4150  dispatch->th_dispatch_sh_current = 0;
4151 
4152  dispatch->th_deo_fcn = 0; /* ORDERED */
4153  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4154  }
4155 
4156  this_thr->th.th_next_pool = NULL;
4157 
4158  if (!this_thr->th.th_task_state_memo_stack) {
4159  size_t i;
4160  this_thr->th.th_task_state_memo_stack =
4161  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4162  this_thr->th.th_task_state_top = 0;
4163  this_thr->th.th_task_state_stack_sz = 4;
4164  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4165  ++i) // zero init the stack
4166  this_thr->th.th_task_state_memo_stack[i] = 0;
4167  }
4168 
4169  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4170  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4171 
4172  KMP_MB();
4173 }
4174 
4175 /* allocate a new thread for the requesting team. this is only called from
4176  within a forkjoin critical section. we will first try to get an available
4177  thread from the thread pool. if none is available, we will fork a new one
4178  assuming we are able to create a new one. this should be assured, as the
4179  caller should check on this first. */
4180 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4181  int new_tid) {
4182  kmp_team_t *serial_team;
4183  kmp_info_t *new_thr;
4184  int new_gtid;
4185 
4186  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4187  KMP_DEBUG_ASSERT(root && team);
4188 #if !KMP_NESTED_HOT_TEAMS
4189  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4190 #endif
4191  KMP_MB();
4192 
4193  /* first, try to get one from the thread pool */
4194  if (__kmp_thread_pool) {
4195 
4196  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4197  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4198  if (new_thr == __kmp_thread_pool_insert_pt) {
4199  __kmp_thread_pool_insert_pt = NULL;
4200  }
4201  TCW_4(new_thr->th.th_in_pool, FALSE);
4202  // Don't touch th_active_in_pool or th_active.
4203  // The worker thread adjusts those flags as it sleeps/awakens.
4204  __kmp_thread_pool_nth--;
4205 
4206  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4207  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4208  KMP_ASSERT(!new_thr->th.th_team);
4209  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4210  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4211 
4212  /* setup the thread structure */
4213  __kmp_initialize_info(new_thr, team, new_tid,
4214  new_thr->th.th_info.ds.ds_gtid);
4215  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4216 
4217  TCW_4(__kmp_nth, __kmp_nth + 1);
4218  root->r.r_cg_nthreads++;
4219 
4220  new_thr->th.th_task_state = 0;
4221  new_thr->th.th_task_state_top = 0;
4222  new_thr->th.th_task_state_stack_sz = 4;
4223 
4224 #ifdef KMP_ADJUST_BLOCKTIME
4225  /* Adjust blocktime back to zero if necessary */
4226  /* Middle initialization might not have occurred yet */
4227  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4228  if (__kmp_nth > __kmp_avail_proc) {
4229  __kmp_zero_bt = TRUE;
4230  }
4231  }
4232 #endif /* KMP_ADJUST_BLOCKTIME */
4233 
4234 #if KMP_DEBUG
4235  // If the thread entered the pool via __kmp_free_thread, wait_flag should
4236  // not be KMP_BARRIER_PARENT_FLAG.
4237  int b;
4238  kmp_balign_t *balign = new_thr->th.th_bar;
4239  for (b = 0; b < bs_last_barrier; ++b)
4240  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4241 #endif
4242 
4243  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4244  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4245 
4246  KMP_MB();
4247  return new_thr;
4248  }
4249 
4250  /* no thread available in the pool, so we'll fork a new one */
4251  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4252  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4253 
4254 #if KMP_USE_MONITOR
4255  // If this is the first worker thread the RTL is creating, then also
4256  // launch the monitor thread. We try to do this as early as possible.
4257  if (!TCR_4(__kmp_init_monitor)) {
4258  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4259  if (!TCR_4(__kmp_init_monitor)) {
4260  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4261  TCW_4(__kmp_init_monitor, 1);
4262  __kmp_create_monitor(&__kmp_monitor);
4263  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4264 #if KMP_OS_WINDOWS
4265  // AC: wait until the monitor has started. This is a fix for CQ232808.
4266  // The reason is that if the library is loaded/unloaded in a loop with
4267  // small (parallel) work in between, then there is a high probability that
4268  // the monitor thread starts only after the library shutdown. At shutdown
4269  // it is too late to cope with the problem, because when the master is in
4270  // DllMain (process detach) the monitor has no chance to start (it is
4271  // blocked), and the master has no means to inform the monitor that the
4272  // library has gone, because all the memory the monitor can access is
4273  // going to be released/reset.
4274  while (TCR_4(__kmp_init_monitor) < 2) {
4275  KMP_YIELD(TRUE);
4276  }
4277  KF_TRACE(10, ("after monitor thread has started\n"));
4278 #endif
4279  }
4280  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4281  }
4282 #endif
4283 
4284  KMP_MB();
4285  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4286  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4287  }
4288 
4289  /* allocate space for it. */
4290  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4291 
4292  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4293 
4294  if (__kmp_storage_map) {
4295  __kmp_print_thread_storage_map(new_thr, new_gtid);
4296  }
4297 
4298  // add the reserve serialized team, initialized from the team's master thread
4299  {
4300  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4301  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4302  new_thr->th.th_serial_team = serial_team =
4303  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4304 #if OMPT_SUPPORT
4305  ompt_data_none, // root parallel id
4306 #endif
4307 #if OMP_40_ENABLED
4308  proc_bind_default,
4309 #endif
4310  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4311  }
4312  KMP_ASSERT(serial_team);
4313  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4314  // execution (it is unused for now).
4315  serial_team->t.t_threads[0] = new_thr;
4316  KF_TRACE(10,
4317  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4318  new_thr));
4319 
4320  /* setup the thread structures */
4321  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4322 
4323 #if USE_FAST_MEMORY
4324  __kmp_initialize_fast_memory(new_thr);
4325 #endif /* USE_FAST_MEMORY */
4326 
4327 #if KMP_USE_BGET
4328  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4329  __kmp_initialize_bget(new_thr);
4330 #endif
4331 
4332  __kmp_init_random(new_thr); // Initialize random number generator
4333 
4334  /* Initialize these only once when thread is grabbed for a team allocation */
4335  KA_TRACE(20,
4336  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4337  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4338 
4339  int b;
4340  kmp_balign_t *balign = new_thr->th.th_bar;
4341  for (b = 0; b < bs_last_barrier; ++b) {
4342  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4343  balign[b].bb.team = NULL;
4344  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4345  balign[b].bb.use_oncore_barrier = 0;
4346  }
4347 
4348  new_thr->th.th_spin_here = FALSE;
4349  new_thr->th.th_next_waiting = 0;
4350 #if KMP_OS_UNIX
4351  new_thr->th.th_blocking = false;
4352 #endif
4353 
4354 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4355  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4356  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4357  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4358  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4359 #endif
4360 #if OMP_50_ENABLED
4361  new_thr->th.th_def_allocator = __kmp_def_allocator;
4362 #endif
4363 
4364  TCW_4(new_thr->th.th_in_pool, FALSE);
4365  new_thr->th.th_active_in_pool = FALSE;
4366  TCW_4(new_thr->th.th_active, TRUE);
4367 
4368  /* adjust the global counters */
4369  __kmp_all_nth++;
4370  __kmp_nth++;
4371 
4372  root->r.r_cg_nthreads++;
4373 
4374  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4375  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4376  if (__kmp_adjust_gtid_mode) {
4377  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4378  if (TCR_4(__kmp_gtid_mode) != 2) {
4379  TCW_4(__kmp_gtid_mode, 2);
4380  }
4381  } else {
4382  if (TCR_4(__kmp_gtid_mode) != 1) {
4383  TCW_4(__kmp_gtid_mode, 1);
4384  }
4385  }
4386  }
4387 
4388 #ifdef KMP_ADJUST_BLOCKTIME
4389  /* Adjust blocktime back to zero if necessary */
4390  /* Middle initialization might not have occurred yet */
4391  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4392  if (__kmp_nth > __kmp_avail_proc) {
4393  __kmp_zero_bt = TRUE;
4394  }
4395  }
4396 #endif /* KMP_ADJUST_BLOCKTIME */
4397 
4398  /* actually fork it and create the new worker thread */
4399  KF_TRACE(
4400  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4401  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4402  KF_TRACE(10,
4403  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4404 
4405  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4406  new_gtid));
4407  KMP_MB();
4408  return new_thr;
4409 }
4410 
4411 /* Reinitialize team for reuse.
4412  The hot team code calls this routine at every fork barrier, so the EPCC
4413  barrier tests are extremely sensitive to changes in it, especially writes to
4414  the team struct, which cause a cache invalidation in all threads.
4415  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4416 static void __kmp_reinitialize_team(kmp_team_t *team,
4417  kmp_internal_control_t *new_icvs,
4418  ident_t *loc) {
4419  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4420  team->t.t_threads[0], team));
4421  KMP_DEBUG_ASSERT(team && new_icvs);
4422  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4423  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4424 
4425  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4426  // Copy ICVs to the master thread's implicit taskdata
4427  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4428  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4429 
4430  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4431  team->t.t_threads[0], team));
4432 }
4433 
4434 /* Initialize the team data structure.
4435  This assumes the t_threads and t_max_nproc are already set.
4436  Also, we don't touch the arguments */
4437 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4438  kmp_internal_control_t *new_icvs,
4439  ident_t *loc) {
4440  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4441 
4442  /* verify */
4443  KMP_DEBUG_ASSERT(team);
4444  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4445  KMP_DEBUG_ASSERT(team->t.t_threads);
4446  KMP_MB();
4447 
4448  team->t.t_master_tid = 0; /* not needed */
4449  /* team->t.t_master_bar; not needed */
4450  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4451  team->t.t_nproc = new_nproc;
4452 
4453  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4454  team->t.t_next_pool = NULL;
4455  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4456  * up hot team */
4457 
4458  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4459  team->t.t_invoke = NULL; /* not needed */
4460 
4461  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4462  team->t.t_sched.sched = new_icvs->sched.sched;
4463 
4464 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4465  team->t.t_fp_control_saved = FALSE; /* not needed */
4466  team->t.t_x87_fpu_control_word = 0; /* not needed */
4467  team->t.t_mxcsr = 0; /* not needed */
4468 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4469 
4470  team->t.t_construct = 0;
4471 
4472  team->t.t_ordered.dt.t_value = 0;
4473  team->t.t_master_active = FALSE;
4474 
4475  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4476 
4477 #ifdef KMP_DEBUG
4478  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4479 #endif
4480 #if KMP_OS_WINDOWS
4481  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4482 #endif
4483 
4484  team->t.t_control_stack_top = NULL;
4485 
4486  __kmp_reinitialize_team(team, new_icvs, loc);
4487 
4488  KMP_MB();
4489  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4490 }
4491 
4492 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4493 /* Sets a full affinity mask for the thread; saves the old mask in *old_mask. */
4494 static void
4495 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4496  if (KMP_AFFINITY_CAPABLE()) {
4497  int status;
4498  if (old_mask != NULL) {
4499  status = __kmp_get_system_affinity(old_mask, TRUE);
4500  int error = errno;
4501  if (status != 0) {
4502  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4503  __kmp_msg_null);
4504  }
4505  }
4506  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4507  }
4508 }
4509 #endif
4510 
4511 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4512 
4513 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4514 // It calculates the worker and master threads' partition based upon the
4515 // parent thread's partition, and binds each worker to a place in its partition.
4516 // The master thread's partition should already include its current binding.
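// Binding policies handled below: proc_bind_master puts every worker on the
// master's place; proc_bind_close packs workers into consecutive places
// starting from the master's place; proc_bind_spread spreads the workers'
// sub-partitions across the master's partition, binding each worker to the
// first place of its sub-partition.  proc_bind_default is only expected for
// serial teams.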
4517 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4518  // Copy the master thread's place partition to the team struct
4519  kmp_info_t *master_th = team->t.t_threads[0];
4520  KMP_DEBUG_ASSERT(master_th != NULL);
4521  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4522  int first_place = master_th->th.th_first_place;
4523  int last_place = master_th->th.th_last_place;
4524  int masters_place = master_th->th.th_current_place;
4525  team->t.t_first_place = first_place;
4526  team->t.t_last_place = last_place;
4527 
4528  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4529  "bound to place %d partition = [%d,%d]\n",
4530  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4531  team->t.t_id, masters_place, first_place, last_place));
4532 
4533  switch (proc_bind) {
4534 
4535  case proc_bind_default:
4536  // serial teams might have the proc_bind policy set to proc_bind_default. It
4537  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4538  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4539  break;
4540 
4541  case proc_bind_master: {
4542  int f;
4543  int n_th = team->t.t_nproc;
4544  for (f = 1; f < n_th; f++) {
4545  kmp_info_t *th = team->t.t_threads[f];
4546  KMP_DEBUG_ASSERT(th != NULL);
4547  th->th.th_first_place = first_place;
4548  th->th.th_last_place = last_place;
4549  th->th.th_new_place = masters_place;
4550 
4551  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4552  "partition = [%d,%d]\n",
4553  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4554  f, masters_place, first_place, last_place));
4555  }
4556  } break;
4557 
4558  case proc_bind_close: {
4559  int f;
4560  int n_th = team->t.t_nproc;
4561  int n_places;
4562  if (first_place <= last_place) {
4563  n_places = last_place - first_place + 1;
4564  } else {
4565  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4566  }
4567  if (n_th <= n_places) {
4568  int place = masters_place;
4569  for (f = 1; f < n_th; f++) {
4570  kmp_info_t *th = team->t.t_threads[f];
4571  KMP_DEBUG_ASSERT(th != NULL);
4572 
4573  if (place == last_place) {
4574  place = first_place;
4575  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4576  place = 0;
4577  } else {
4578  place++;
4579  }
4580  th->th.th_first_place = first_place;
4581  th->th.th_last_place = last_place;
4582  th->th.th_new_place = place;
4583 
4584  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4585  "partition = [%d,%d]\n",
4586  __kmp_gtid_from_thread(team->t.t_threads[f]),
4587  team->t.t_id, f, place, first_place, last_place));
4588  }
4589  } else {
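// More threads than places: give each place S = n_th / n_places threads and
// distribute the rem = n_th % n_places extra threads one per 'gap' places.
// E.g. (illustrative) n_th = 7, n_places = 3 yields 3, 2, 2 threads starting
// at the master's place.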
4590  int S, rem, gap, s_count;
4591  S = n_th / n_places;
4592  s_count = 0;
4593  rem = n_th - (S * n_places);
4594  gap = rem > 0 ? n_places / rem : n_places;
4595  int place = masters_place;
4596  int gap_ct = gap;
4597  for (f = 0; f < n_th; f++) {
4598  kmp_info_t *th = team->t.t_threads[f];
4599  KMP_DEBUG_ASSERT(th != NULL);
4600 
4601  th->th.th_first_place = first_place;
4602  th->th.th_last_place = last_place;
4603  th->th.th_new_place = place;
4604  s_count++;
4605 
4606  if ((s_count == S) && rem && (gap_ct == gap)) {
4607  // do nothing, add an extra thread to place on next iteration
4608  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4609  // we added an extra thread to this place; move to next place
4610  if (place == last_place) {
4611  place = first_place;
4612  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4613  place = 0;
4614  } else {
4615  place++;
4616  }
4617  s_count = 0;
4618  gap_ct = 1;
4619  rem--;
4620  } else if (s_count == S) { // place full; don't add extra
4621  if (place == last_place) {
4622  place = first_place;
4623  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4624  place = 0;
4625  } else {
4626  place++;
4627  }
4628  gap_ct++;
4629  s_count = 0;
4630  }
4631 
4632  KA_TRACE(100,
4633  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4634  "partition = [%d,%d]\n",
4635  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4636  th->th.th_new_place, first_place, last_place));
4637  }
4638  KMP_DEBUG_ASSERT(place == masters_place);
4639  }
4640  } break;
4641 
4642  case proc_bind_spread: {
4643  int f;
4644  int n_th = team->t.t_nproc;
4645  int n_places;
4646  int thidx;
4647  if (first_place <= last_place) {
4648  n_places = last_place - first_place + 1;
4649  } else {
4650  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4651  }
4652  if (n_th <= n_places) {
4653  int place = -1;
4654 
4655  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4656  int S = n_places / n_th;
4657  int s_count, rem, gap, gap_ct;
4658 
4659  place = masters_place;
4660  rem = n_places - n_th * S;
4661  gap = rem ? n_th / rem : 1;
4662  gap_ct = gap;
4663  thidx = n_th;
4664  if (update_master_only == 1)
4665  thidx = 1;
4666  for (f = 0; f < thidx; f++) {
4667  kmp_info_t *th = team->t.t_threads[f];
4668  KMP_DEBUG_ASSERT(th != NULL);
4669 
4670  th->th.th_first_place = place;
4671  th->th.th_new_place = place;
4672  s_count = 1;
4673  while (s_count < S) {
4674  if (place == last_place) {
4675  place = first_place;
4676  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4677  place = 0;
4678  } else {
4679  place++;
4680  }
4681  s_count++;
4682  }
4683  if (rem && (gap_ct == gap)) {
4684  if (place == last_place) {
4685  place = first_place;
4686  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4687  place = 0;
4688  } else {
4689  place++;
4690  }
4691  rem--;
4692  gap_ct = 0;
4693  }
4694  th->th.th_last_place = place;
4695  gap_ct++;
4696 
4697  if (place == last_place) {
4698  place = first_place;
4699  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4700  place = 0;
4701  } else {
4702  place++;
4703  }
4704 
4705  KA_TRACE(100,
4706  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4707  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4708  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4709  f, th->th.th_new_place, th->th.th_first_place,
4710  th->th.th_last_place, __kmp_affinity_num_masks));
4711  }
4712  } else {
4713  /* The available computation places form a uniform space, so we can create
4714  T partitions of roughly round(P/T) places each and put each thread into
4715  the first place of its partition. */
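  /* Illustrative example (the concrete values below are assumed, not taken
     from the code): with n_places = 8, n_th = 3 and masters_place = 0,
     spacing = 9/3 = 3.0, so threads 0, 1 and 2 get the partitions [0,2],
     [3,5] and [6,7] respectively, each starting at the first place of its
     partition. */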
4716  double current = static_cast<double>(masters_place);
4717  double spacing =
4718  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4719  int first, last;
4720  kmp_info_t *th;
4721 
4722  thidx = n_th + 1;
4723  if (update_master_only == 1)
4724  thidx = 1;
4725  for (f = 0; f < thidx; f++) {
4726  first = static_cast<int>(current);
4727  last = static_cast<int>(current + spacing) - 1;
4728  KMP_DEBUG_ASSERT(last >= first);
4729  if (first >= n_places) {
4730  if (masters_place) {
4731  first -= n_places;
4732  last -= n_places;
4733  if (first == (masters_place + 1)) {
4734  KMP_DEBUG_ASSERT(f == n_th);
4735  first--;
4736  }
4737  if (last == masters_place) {
4738  KMP_DEBUG_ASSERT(f == (n_th - 1));
4739  last--;
4740  }
4741  } else {
4742  KMP_DEBUG_ASSERT(f == n_th);
4743  first = 0;
4744  last = 0;
4745  }
4746  }
4747  if (last >= n_places) {
4748  last = (n_places - 1);
4749  }
4750  place = first;
4751  current += spacing;
4752  if (f < n_th) {
4753  KMP_DEBUG_ASSERT(0 <= first);
4754  KMP_DEBUG_ASSERT(n_places > first);
4755  KMP_DEBUG_ASSERT(0 <= last);
4756  KMP_DEBUG_ASSERT(n_places > last);
4757  KMP_DEBUG_ASSERT(last_place >= first_place);
4758  th = team->t.t_threads[f];
4759  KMP_DEBUG_ASSERT(th);
4760  th->th.th_first_place = first;
4761  th->th.th_new_place = place;
4762  th->th.th_last_place = last;
4763 
4764  KA_TRACE(100,
4765  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4766  "partition = [%d,%d], spacing = %.4f\n",
4767  __kmp_gtid_from_thread(team->t.t_threads[f]),
4768  team->t.t_id, f, th->th.th_new_place,
4769  th->th.th_first_place, th->th.th_last_place, spacing));
4770  }
4771  }
4772  }
4773  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4774  } else {
4775  int S, rem, gap, s_count;
4776  S = n_th / n_places;
4777  s_count = 0;
4778  rem = n_th - (S * n_places);
4779  gap = rem > 0 ? n_places / rem : n_places;
4780  int place = masters_place;
4781  int gap_ct = gap;
4782  thidx = n_th;
4783  if (update_master_only == 1)
4784  thidx = 1;
4785  for (f = 0; f < thidx; f++) {
4786  kmp_info_t *th = team->t.t_threads[f];
4787  KMP_DEBUG_ASSERT(th != NULL);
4788 
4789  th->th.th_first_place = place;
4790  th->th.th_last_place = place;
4791  th->th.th_new_place = place;
4792  s_count++;
4793 
4794  if ((s_count == S) && rem && (gap_ct == gap)) {
4795  // do nothing, add an extra thread to place on next iteration
4796  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4797  // we added an extra thread to this place; move on to next place
4798  if (place == last_place) {
4799  place = first_place;
4800  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4801  place = 0;
4802  } else {
4803  place++;
4804  }
4805  s_count = 0;
4806  gap_ct = 1;
4807  rem--;
4808  } else if (s_count == S) { // place is full; don't add extra thread
4809  if (place == last_place) {
4810  place = first_place;
4811  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4812  place = 0;
4813  } else {
4814  place++;
4815  }
4816  gap_ct++;
4817  s_count = 0;
4818  }
4819 
4820  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4821  "partition = [%d,%d]\n",
4822  __kmp_gtid_from_thread(team->t.t_threads[f]),
4823  team->t.t_id, f, th->th.th_new_place,
4824  th->th.th_first_place, th->th.th_last_place));
4825  }
4826  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4827  }
4828  } break;
4829 
4830  default:
4831  break;
4832  }
4833 
4834  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4835 }
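
// A minimal sketch (hypothetical helper, not used by the runtime) of the
// wrap-around place advance that the proc_bind cases above repeat inline;
// it factors the three-way branch into one function with the same semantics.
static inline int __kmp_next_place_sketch(int place, int first_place,
                                          int last_place) {
  if (place == last_place)
    return first_place; // wrap within the partition
  if (place == (int)(__kmp_affinity_num_masks - 1))
    return 0; // wrap around the end of the global mask array
  return place + 1;
}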
4836 
4837 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4838 
4839 /* allocate a new team data structure to use. take one off of the free pool if
4840  available */
4841 kmp_team_t *
4842 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4843 #if OMPT_SUPPORT
4844  ompt_data_t ompt_parallel_data,
4845 #endif
4846 #if OMP_40_ENABLED
4847  kmp_proc_bind_t new_proc_bind,
4848 #endif
4849  kmp_internal_control_t *new_icvs,
4850  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4851  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4852  int f;
4853  kmp_team_t *team;
4854  int use_hot_team = !root->r.r_active;
4855  int level = 0;
4856 
4857  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4858  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4859  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4860  KMP_MB();
4861 
4862 #if KMP_NESTED_HOT_TEAMS
4863  kmp_hot_team_ptr_t *hot_teams;
4864  if (master) {
4865  team = master->th.th_team;
4866  level = team->t.t_active_level;
4867  if (master->th.th_teams_microtask) { // in teams construct?
4868  if (master->th.th_teams_size.nteams > 1 &&
4869  ( // #teams > 1
4870  team->t.t_pkfn ==
4871  (microtask_t)__kmp_teams_master || // inner fork of the teams
4872  master->th.th_teams_level <
4873  team->t.t_level)) { // or nested parallel inside the teams
4874  ++level; // do not increment if #teams==1 or for the outer fork of the
4875  // teams; increment otherwise
4876  }
4877  }
4878  hot_teams = master->th.th_hot_teams;
4879  if (level < __kmp_hot_teams_max_level && hot_teams &&
4880  hot_teams[level]
4881  .hot_team) { // hot team has already been allocated for given level
4882  use_hot_team = 1;
4883  } else {
4884  use_hot_team = 0;
4885  }
4886  }
4887 #endif
4888  // Optimization to use a "hot" team
4889  if (use_hot_team && new_nproc > 1) {
4890  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4891 #if KMP_NESTED_HOT_TEAMS
4892  team = hot_teams[level].hot_team;
4893 #else
4894  team = root->r.r_hot_team;
4895 #endif
4896 #if KMP_DEBUG
4897  if (__kmp_tasking_mode != tskm_immediate_exec) {
4898  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4899  "task_team[1] = %p before reinit\n",
4900  team->t.t_task_team[0], team->t.t_task_team[1]));
4901  }
4902 #endif
4903 
4904  // Has the number of threads changed?
4905  /* Let's assume the most common case is that the number of threads is
4906  unchanged, and put that case first. */
4907  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4908  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4909  // This case can mean that omp_set_num_threads() was called and the hot
4910  // team size was already reduced, so we check the special flag
4911  if (team->t.t_size_changed == -1) {
4912  team->t.t_size_changed = 1;
4913  } else {
4914  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4915  }
4916 
4917  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4918  kmp_r_sched_t new_sched = new_icvs->sched;
4919  // set master's schedule as new run-time schedule
4920  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4921 
4922  __kmp_reinitialize_team(team, new_icvs,
4923  root->r.r_uber_thread->th.th_ident);
4924 
4925  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4926  team->t.t_threads[0], team));
4927  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4928 
4929 #if OMP_40_ENABLED
4930 #if KMP_AFFINITY_SUPPORTED
4931  if ((team->t.t_size_changed == 0) &&
4932  (team->t.t_proc_bind == new_proc_bind)) {
4933  if (new_proc_bind == proc_bind_spread) {
4934  __kmp_partition_places(
4935  team, 1); // add flag to update only master for spread
4936  }
4937  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4938  "proc_bind = %d, partition = [%d,%d]\n",
4939  team->t.t_id, new_proc_bind, team->t.t_first_place,
4940  team->t.t_last_place));
4941  } else {
4942  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4943  __kmp_partition_places(team);
4944  }
4945 #else
4946  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4947 #endif /* KMP_AFFINITY_SUPPORTED */
4948 #endif /* OMP_40_ENABLED */
4949  } else if (team->t.t_nproc > new_nproc) {
4950  KA_TRACE(20,
4951  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4952  new_nproc));
4953 
4954  team->t.t_size_changed = 1;
4955 #if KMP_NESTED_HOT_TEAMS
4956  if (__kmp_hot_teams_mode == 0) {
4957  // AC: the saved number of threads should correspond to the team's value in
4958  // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
4959  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4960  hot_teams[level].hot_team_nth = new_nproc;
4961 #endif // KMP_NESTED_HOT_TEAMS
4962  /* release the extra threads we don't need any more */
4963  for (f = new_nproc; f < team->t.t_nproc; f++) {
4964  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4965  if (__kmp_tasking_mode != tskm_immediate_exec) {
4966  // When decreasing team size, threads no longer in the team should
4967  // unref task team.
4968  team->t.t_threads[f]->th.th_task_team = NULL;
4969  }
4970  __kmp_free_thread(team->t.t_threads[f]);
4971  team->t.t_threads[f] = NULL;
4972  }
4973 #if KMP_NESTED_HOT_TEAMS
4974  } // (__kmp_hot_teams_mode == 0)
4975  else {
4976  // When keeping extra threads in team, switch threads to wait on own
4977  // b_go flag
4978  for (f = new_nproc; f < team->t.t_nproc; ++f) {
4979  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4980  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4981  for (int b = 0; b < bs_last_barrier; ++b) {
4982  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4983  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4984  }
4985  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4986  }
4987  }
4988  }
4989 #endif // KMP_NESTED_HOT_TEAMS
4990  team->t.t_nproc = new_nproc;
4991  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4992  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4993  __kmp_reinitialize_team(team, new_icvs,
4994  root->r.r_uber_thread->th.th_ident);
4995 
4996  /* update the remaining threads */
4997  for (f = 0; f < new_nproc; ++f) {
4998  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4999  }
5000  // restore the current task state of the master thread: should be the
5001  // implicit task
5002  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5003  team->t.t_threads[0], team));
5004 
5005  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5006 
5007 #ifdef KMP_DEBUG
5008  for (f = 0; f < team->t.t_nproc; f++) {
5009  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5010  team->t.t_threads[f]->th.th_team_nproc ==
5011  team->t.t_nproc);
5012  }
5013 #endif
5014 
5015 #if OMP_40_ENABLED
5016  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5017 #if KMP_AFFINITY_SUPPORTED
5018  __kmp_partition_places(team);
5019 #endif
5020 #endif
5021  } else { // team->t.t_nproc < new_nproc
5022 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5023  kmp_affin_mask_t *old_mask;
5024  if (KMP_AFFINITY_CAPABLE()) {
5025  KMP_CPU_ALLOC(old_mask);
5026  }
5027 #endif
5028 
5029  KA_TRACE(20,
5030  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5031  new_nproc));
5032 
5033  team->t.t_size_changed = 1;
5034 
5035 #if KMP_NESTED_HOT_TEAMS
5036  int avail_threads = hot_teams[level].hot_team_nth;
5037  if (new_nproc < avail_threads)
5038  avail_threads = new_nproc;
5039  kmp_info_t **other_threads = team->t.t_threads;
5040  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5041  // Adjust barrier data of reserved threads (if any) of the team
5042  // Other data will be set in __kmp_initialize_info() below.
5043  int b;
5044  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5045  for (b = 0; b < bs_last_barrier; ++b) {
5046  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5047  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5048 #if USE_DEBUGGER
5049  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5050 #endif
5051  }
5052  }
5053  if (hot_teams[level].hot_team_nth >= new_nproc) {
5054  // we have all the needed threads in reserve, so there is no need to allocate any;
5055  // this is only possible in mode 1, as there cannot be reserved threads in mode 0
5056  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5057  team->t.t_nproc = new_nproc; // just get reserved threads involved
5058  } else {
5059  // we may have some threads in reserve, but not enough
5060  team->t.t_nproc =
5061  hot_teams[level]
5062  .hot_team_nth; // get reserved threads involved if any
5063  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5064 #endif // KMP_NESTED_HOT_TEAMS
5065  if (team->t.t_max_nproc < new_nproc) {
5066  /* reallocate larger arrays */
5067  __kmp_reallocate_team_arrays(team, new_nproc);
5068  __kmp_reinitialize_team(team, new_icvs, NULL);
5069  }
5070 
5071 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5072  /* Temporarily set full mask for master thread before creation of
5073  workers. The reason is that workers inherit the affinity from the master,
5074  so if a lot of workers are created on a single core quickly, they
5075  don't get a chance to set their own affinity for a long time. */
5076  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5077 #endif
5078 
5079  /* allocate new threads for the hot team */
5080  for (f = team->t.t_nproc; f < new_nproc; f++) {
5081  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5082  KMP_DEBUG_ASSERT(new_worker);
5083  team->t.t_threads[f] = new_worker;
5084 
5085  KA_TRACE(20,
5086  ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5087  "join=%llu, plain=%llu\n",
5088  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5089  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5090  team->t.t_bar[bs_plain_barrier].b_arrived));
5091 
5092  { // Initialize barrier data for new threads.
5093  int b;
5094  kmp_balign_t *balign = new_worker->th.th_bar;
5095  for (b = 0; b < bs_last_barrier; ++b) {
5096  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5097  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5098  KMP_BARRIER_PARENT_FLAG);
5099 #if USE_DEBUGGER
5100  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5101 #endif
5102  }
5103  }
5104  }
5105 
5106 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5107  if (KMP_AFFINITY_CAPABLE()) {
5108  /* Restore initial master thread's affinity mask */
5109  __kmp_set_system_affinity(old_mask, TRUE);
5110  KMP_CPU_FREE(old_mask);
5111  }
5112 #endif
5113 #if KMP_NESTED_HOT_TEAMS
5114  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5115 #endif // KMP_NESTED_HOT_TEAMS
5116  /* make sure everyone is synchronized */
5117  int old_nproc = team->t.t_nproc; // save old value and use to update only
5118  // new threads below
5119  __kmp_initialize_team(team, new_nproc, new_icvs,
5120  root->r.r_uber_thread->th.th_ident);
5121 
5122  /* reinitialize the threads */
5123  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5124  for (f = 0; f < team->t.t_nproc; ++f)
5125  __kmp_initialize_info(team->t.t_threads[f], team, f,
5126  __kmp_gtid_from_tid(f, team));
5127  if (level) { // set th_task_state for new threads in nested hot team
5128  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5129  // only need to set the th_task_state for the new threads. th_task_state
5130  // for master thread will not be accurate until after this in
5131  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5132  // correct value.
5133  for (f = old_nproc; f < team->t.t_nproc; ++f)
5134  team->t.t_threads[f]->th.th_task_state =
5135  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5136  } else { // set th_task_state for new threads in non-nested hot team
5137  int old_state =
5138  team->t.t_threads[0]->th.th_task_state; // copy master's state
5139  for (f = old_nproc; f < team->t.t_nproc; ++f)
5140  team->t.t_threads[f]->th.th_task_state = old_state;
5141  }
5142 
5143 #ifdef KMP_DEBUG
5144  for (f = 0; f < team->t.t_nproc; ++f) {
5145  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5146  team->t.t_threads[f]->th.th_team_nproc ==
5147  team->t.t_nproc);
5148  }
5149 #endif
5150 
5151 #if OMP_40_ENABLED
5152  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5153 #if KMP_AFFINITY_SUPPORTED
5154  __kmp_partition_places(team);
5155 #endif
5156 #endif
5157  } // Check changes in number of threads
5158 
5159 #if OMP_40_ENABLED
5160  kmp_info_t *master = team->t.t_threads[0];
5161  if (master->th.th_teams_microtask) {
5162  for (f = 1; f < new_nproc; ++f) {
5163  // propagate teams construct specific info to workers
5164  kmp_info_t *thr = team->t.t_threads[f];
5165  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5166  thr->th.th_teams_level = master->th.th_teams_level;
5167  thr->th.th_teams_size = master->th.th_teams_size;
5168  }
5169  }
5170 #endif /* OMP_40_ENABLED */
5171 #if KMP_NESTED_HOT_TEAMS
5172  if (level) {
5173  // Sync barrier state for nested hot teams, not needed for outermost hot
5174  // team.
5175  for (f = 1; f < new_nproc; ++f) {
5176  kmp_info_t *thr = team->t.t_threads[f];
5177  int b;
5178  kmp_balign_t *balign = thr->th.th_bar;
5179  for (b = 0; b < bs_last_barrier; ++b) {
5180  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5181  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5182 #if USE_DEBUGGER
5183  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5184 #endif
5185  }
5186  }
5187  }
5188 #endif // KMP_NESTED_HOT_TEAMS
5189 
5190  /* reallocate space for arguments if necessary */
5191  __kmp_alloc_argv_entries(argc, team, TRUE);
5192  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5193  // The hot team re-uses the previous task team,
5194  // if untouched during the previous release->gather phase.
5195 
5196  KF_TRACE(10, (" hot_team = %p\n", team));
5197 
5198 #if KMP_DEBUG
5199  if (__kmp_tasking_mode != tskm_immediate_exec) {
5200  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5201  "task_team[1] = %p after reinit\n",
5202  team->t.t_task_team[0], team->t.t_task_team[1]));
5203  }
5204 #endif
5205 
5206 #if OMPT_SUPPORT
5207  __ompt_team_assign_id(team, ompt_parallel_data);
5208 #endif
5209 
5210  KMP_MB();
5211 
5212  return team;
5213  }
5214 
5215  /* next, let's try to take one from the team pool */
5216  KMP_MB();
5217  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5218  /* TODO: consider resizing undersized teams instead of reaping them, now
5219  that we have a resizing mechanism */
5220  if (team->t.t_max_nproc >= max_nproc) {
5221  /* take this team from the team pool */
5222  __kmp_team_pool = team->t.t_next_pool;
5223 
5224  /* setup the team for fresh use */
5225  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5226 
5227  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5228  "task_team[1] %p to NULL\n",
5229  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5230  team->t.t_task_team[0] = NULL;
5231  team->t.t_task_team[1] = NULL;
5232 
5233  /* reallocate space for arguments if necessary */
5234  __kmp_alloc_argv_entries(argc, team, TRUE);
5235  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5236 
5237  KA_TRACE(
5238  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5239  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5240  { // Initialize barrier data.
5241  int b;
5242  for (b = 0; b < bs_last_barrier; ++b) {
5243  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5244 #if USE_DEBUGGER
5245  team->t.t_bar[b].b_master_arrived = 0;
5246  team->t.t_bar[b].b_team_arrived = 0;
5247 #endif
5248  }
5249  }
5250 
5251 #if OMP_40_ENABLED
5252  team->t.t_proc_bind = new_proc_bind;
5253 #endif
5254 
5255  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5256  team->t.t_id));
5257 
5258 #if OMPT_SUPPORT
5259  __ompt_team_assign_id(team, ompt_parallel_data);
5260 #endif
5261 
5262  KMP_MB();
5263 
5264  return team;
5265  }
5266 
5267  /* reap team if it is too small, then loop back and check the next one */
5268  // not sure if this is wise, but it will be redone during the hot-teams
5269  // rewrite.
5270  /* TODO: Use technique to find the right size hot-team, don't reap them */
5271  team = __kmp_reap_team(team);
5272  __kmp_team_pool = team;
5273  }
5274 
5275  /* nothing available in the pool, no matter, make a new team! */
5276  KMP_MB();
5277  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5278 
5279  /* and set it up */
5280  team->t.t_max_nproc = max_nproc;
5281  /* NOTE well, for some reason allocating one big buffer and dividing it up
5282  seems to really hurt performance a lot on the P4, so let's not use this */
5283  __kmp_allocate_team_arrays(team, max_nproc);
5284 
5285  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5286  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5287 
5288  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5289  "%p to NULL\n",
5290  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5291  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5292  // memory, no need to duplicate
5293  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5294  // memory, no need to duplicate
5295 
5296  if (__kmp_storage_map) {
5297  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5298  }
5299 
5300  /* allocate space for arguments */
5301  __kmp_alloc_argv_entries(argc, team, FALSE);
5302  team->t.t_argc = argc;
5303 
5304  KA_TRACE(20,
5305  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5306  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5307  { // Initialize barrier data.
5308  int b;
5309  for (b = 0; b < bs_last_barrier; ++b) {
5310  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5311 #if USE_DEBUGGER
5312  team->t.t_bar[b].b_master_arrived = 0;
5313  team->t.t_bar[b].b_team_arrived = 0;
5314 #endif
5315  }
5316  }
5317 
5318 #if OMP_40_ENABLED
5319  team->t.t_proc_bind = new_proc_bind;
5320 #endif
5321 
5322 #if OMPT_SUPPORT
5323  __ompt_team_assign_id(team, ompt_parallel_data);
5324  team->t.ompt_serialized_team_info = NULL;
5325 #endif
5326 
5327  KMP_MB();
5328 
5329  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5330  team->t.t_id));
5331 
5332  return team;
5333 }
5334 
5335 /* TODO implement hot-teams at all levels */
5336 /* TODO implement lazy thread release on demand (disband request) */
5337 
5338 /* free the team. return it to the team pool. release all the threads
5339  * associated with it */
5340 void __kmp_free_team(kmp_root_t *root,
5341  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5342  int f;
5343  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5344  team->t.t_id));
5345 
5346  /* verify state */
5347  KMP_DEBUG_ASSERT(root);
5348  KMP_DEBUG_ASSERT(team);
5349  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5350  KMP_DEBUG_ASSERT(team->t.t_threads);
5351 
5352  int use_hot_team = team == root->r.r_hot_team;
5353 #if KMP_NESTED_HOT_TEAMS
5354  int level;
5355  kmp_hot_team_ptr_t *hot_teams;
5356  if (master) {
5357  level = team->t.t_active_level - 1;
5358  if (master->th.th_teams_microtask) { // in teams construct?
5359  if (master->th.th_teams_size.nteams > 1) {
5360  ++level; // level was not increased in teams construct for
5361  // team_of_masters
5362  }
5363  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5364  master->th.th_teams_level == team->t.t_level) {
5365  ++level; // level was not increased in teams construct for
5366  // team_of_workers before the parallel
5367  } // team->t.t_level will be increased inside parallel
5368  }
5369  hot_teams = master->th.th_hot_teams;
5370  if (level < __kmp_hot_teams_max_level) {
5371  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5372  use_hot_team = 1;
5373  }
5374  }
5375 #endif // KMP_NESTED_HOT_TEAMS
5376 
5377  /* team is done working */
5378  TCW_SYNC_PTR(team->t.t_pkfn,
5379  NULL); // Important for Debugging Support Library.
5380 #if KMP_OS_WINDOWS
5381  team->t.t_copyin_counter = 0; // init counter for possible reuse
5382 #endif
5383  // Do not reset pointer to parent team to NULL for hot teams.
5384 
5385  /* if we are non-hot team, release our threads */
5386  if (!use_hot_team) {
5387  if (__kmp_tasking_mode != tskm_immediate_exec) {
5388  // Wait for threads to reach reapable state
5389  for (f = 1; f < team->t.t_nproc; ++f) {
5390  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5391  kmp_info_t *th = team->t.t_threads[f];
5392  volatile kmp_uint32 *state = &th->th.th_reap_state;
5393  while (*state != KMP_SAFE_TO_REAP) {
5394 #if KMP_OS_WINDOWS
5395  // On Windows a thread can be killed at any time, check this
5396  DWORD ecode;
5397  if (!__kmp_is_thread_alive(th, &ecode)) {
5398  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5399  break;
5400  }
5401 #endif
5402  // first check if thread is sleeping
5403  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5404  if (fl.is_sleeping())
5405  fl.resume(__kmp_gtid_from_thread(th));
5406  KMP_CPU_PAUSE();
5407  }
5408  }
5409 
5410  // Delete task teams
5411  int tt_idx;
5412  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5413  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5414  if (task_team != NULL) {
5415  for (f = 0; f < team->t.t_nproc;
5416  ++f) { // Have all threads unref task teams
5417  team->t.t_threads[f]->th.th_task_team = NULL;
5418  }
5419  KA_TRACE(
5420  20,
5421  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5422  __kmp_get_gtid(), task_team, team->t.t_id));
5423 #if KMP_NESTED_HOT_TEAMS
5424  __kmp_free_task_team(master, task_team);
5425 #endif
5426  team->t.t_task_team[tt_idx] = NULL;
5427  }
5428  }
5429  }
5430 
5431  // Reset pointer to parent team only for non-hot teams.
5432  team->t.t_parent = NULL;
5433  team->t.t_level = 0;
5434  team->t.t_active_level = 0;
5435 
5436  /* free the worker threads */
5437  for (f = 1; f < team->t.t_nproc; ++f) {
5438  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5439  __kmp_free_thread(team->t.t_threads[f]);
5440  team->t.t_threads[f] = NULL;
5441  }
5442 
5443  /* put the team back in the team pool */
5444  /* TODO limit size of team pool, call reap_team if pool too large */
5445  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5446  __kmp_team_pool = (volatile kmp_team_t *)team;
5447  }
5448 
5449  KMP_MB();
5450 }
5451 
5452 /* reap the team. destroy it, reclaim all its resources and free its memory */
5453 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5454  kmp_team_t *next_pool = team->t.t_next_pool;
5455 
5456  KMP_DEBUG_ASSERT(team);
5457  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5458  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5459  KMP_DEBUG_ASSERT(team->t.t_threads);
5460  KMP_DEBUG_ASSERT(team->t.t_argv);
5461 
5462  /* TODO clean the threads that are a part of this? */
5463 
5464  /* free stuff */
5465  __kmp_free_team_arrays(team);
5466  if (team->t.t_argv != &team->t.t_inline_argv[0])
5467  __kmp_free((void *)team->t.t_argv);
5468  __kmp_free(team);
5469 
5470  KMP_MB();
5471  return next_pool;
5472 }
5473 
5474 // Free the thread. Don't reap it, just place it on the pool of available
5475 // threads.
5476 //
5477 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5478 // binding for the affinity mechanism to be useful.
5479 //
5480 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5481 // However, we want to avoid a potential performance problem by always
5482 // scanning through the list to find the correct point at which to insert
5483 // the thread (potential N**2 behavior). To do this we keep track of the
5484 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5485 // With single-level parallelism, threads will always be added to the tail
5486 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5487 // parallelism, all bets are off and we may need to scan through the entire
5488 // free list.
5489 //
5490 // This change also has a potentially large performance benefit, for some
5491 // applications. Previously, as threads were freed from the hot team, they
5492 // would be placed back on the free list in inverse order. If the hot team
5493  // grew back to its original size, then the freed threads would be placed
5494 // back on the hot team in reverse order. This could cause bad cache
5495 // locality problems on programs where the size of the hot team regularly
5496 // grew and shrunk.
5497 //
5498  // Now, for single-level parallelism, the OMP tid is always == gtid.
5499 void __kmp_free_thread(kmp_info_t *this_th) {
5500  int gtid;
5501  kmp_info_t **scan;
5502  kmp_root_t *root = this_th->th.th_root;
5503 
5504  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5505  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5506 
5507  KMP_DEBUG_ASSERT(this_th);
5508 
5509  // When moving a thread to the pool, switch it to wait on its own b_go flag
5510  // and to an uninitialized state (NULL team).
5511  int b;
5512  kmp_balign_t *balign = this_th->th.th_bar;
5513  for (b = 0; b < bs_last_barrier; ++b) {
5514  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5515  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5516  balign[b].bb.team = NULL;
5517  balign[b].bb.leaf_kids = 0;
5518  }
5519  this_th->th.th_task_state = 0;
5520  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5521 
5522  /* put thread back on the free pool */
5523  TCW_PTR(this_th->th.th_team, NULL);
5524  TCW_PTR(this_th->th.th_root, NULL);
5525  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5526 
5527  /* If the implicit task assigned to this thread can be used by other threads
5528  * -> multiple threads can share the data and try to free the task at
5529  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5530  * with higher probability when the hot team is disabled, but can occur even when
5531  * the hot team is enabled */
5532  __kmp_free_implicit_task(this_th);
5533  this_th->th.th_current_task = NULL;
5534 
5535  // If the __kmp_thread_pool_insert_pt is already past the new insert
5536  // point, then we need to re-scan the entire list.
5537  gtid = this_th->th.th_info.ds.ds_gtid;
5538  if (__kmp_thread_pool_insert_pt != NULL) {
5539  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5540  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5541  __kmp_thread_pool_insert_pt = NULL;
5542  }
5543  }
5544 
5545  // Scan down the list to find the place to insert the thread.
5546  // scan is the address of a link in the list, possibly the address of
5547  // __kmp_thread_pool itself.
5548  //
5549  // In the absence of nested parallelism, the for loop will have 0 iterations.
5550  if (__kmp_thread_pool_insert_pt != NULL) {
5551  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5552  } else {
5553  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5554  }
5555  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5556  scan = &((*scan)->th.th_next_pool))
5557  ;
5558 
5559  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5560  // to its address.
5561  TCW_PTR(this_th->th.th_next_pool, *scan);
5562  __kmp_thread_pool_insert_pt = *scan = this_th;
5563  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5564  (this_th->th.th_info.ds.ds_gtid <
5565  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5566  TCW_4(this_th->th.th_in_pool, TRUE);
5567  __kmp_thread_pool_nth++;
5568 
5569  TCW_4(__kmp_nth, __kmp_nth - 1);
5570  root->r.r_cg_nthreads--;
5571 
5572 #ifdef KMP_ADJUST_BLOCKTIME
5573  /* Adjust blocktime back to user setting or default if necessary */
5574  /* Middle initialization might never have occurred */
5575  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5576  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5577  if (__kmp_nth <= __kmp_avail_proc) {
5578  __kmp_zero_bt = FALSE;
5579  }
5580  }
5581 #endif /* KMP_ADJUST_BLOCKTIME */
5582 
5583  KMP_MB();
5584 }
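
// A minimal sketch of the sorted-insert-with-cached-hint strategy described
// in the comment above __kmp_free_thread, compiled out and using invented
// names (sketch_*); it is not part of the runtime and only illustrates how
// the cached insertion point avoids a full rescan in the common
// single-level-parallelism case.
#if 0
struct sketch_node_t {
  int gtid;
  sketch_node_t *next;
};
static sketch_node_t *sketch_pool = NULL; // list kept sorted by ascending gtid
static sketch_node_t *sketch_insert_pt = NULL; // last insertion point

static void sketch_pool_insert(sketch_node_t *node) {
  // If the cached point is already past the new node, fall back to a full scan.
  if (sketch_insert_pt != NULL && sketch_insert_pt->gtid > node->gtid)
    sketch_insert_pt = NULL;
  sketch_node_t **scan =
      (sketch_insert_pt != NULL) ? &sketch_insert_pt->next : &sketch_pool;
  while (*scan != NULL && (*scan)->gtid < node->gtid)
    scan = &(*scan)->next;
  node->next = *scan; // splice in before the first larger gtid
  *scan = node;
  sketch_insert_pt = node; // remember where we inserted
}
#endif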
5585 
5586 /* ------------------------------------------------------------------------ */
5587 
5588 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5589  int gtid = this_thr->th.th_info.ds.ds_gtid;
5590  /* void *stack_data;*/
5591  kmp_team_t *(*volatile pteam);
5592 
5593  KMP_MB();
5594  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5595 
5596  if (__kmp_env_consistency_check) {
5597  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5598  }
5599 
5600 #if OMPT_SUPPORT
5601  ompt_data_t *thread_data;
5602  if (ompt_enabled.enabled) {
5603  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5604  thread_data->ptr = NULL;
5605 
5606  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5607  this_thr->th.ompt_thread_info.wait_id = 0;
5608  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5609  if (ompt_enabled.ompt_callback_thread_begin) {
5610  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5611  ompt_thread_worker, thread_data);
5612  }
5613  }
5614 #endif
5615 
5616 #if OMPT_SUPPORT
5617  if (ompt_enabled.enabled) {
5618  this_thr->th.ompt_thread_info.state = omp_state_idle;
5619  }
5620 #endif
5621  /* This is the place where threads wait for work */
5622  while (!TCR_4(__kmp_global.g.g_done)) {
5623  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5624  KMP_MB();
5625 
5626  /* wait for work to do */
5627  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5628 
5629  /* No tid yet since not part of a team */
5630  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5631 
5632 #if OMPT_SUPPORT
5633  if (ompt_enabled.enabled) {
5634  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5635  }
5636 #endif
5637 
5638  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5639 
5640  /* have we been allocated? */
5641  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5642  /* we were just woken up, so run our new task */
5643  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5644  int rc;
5645  KA_TRACE(20,
5646  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5647  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5648  (*pteam)->t.t_pkfn));
5649 
5650  updateHWFPControl(*pteam);
5651 
5652 #if OMPT_SUPPORT
5653  if (ompt_enabled.enabled) {
5654  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5655  }
5656 #endif
5657 
5658  rc = (*pteam)->t.t_invoke(gtid);
5659  KMP_ASSERT(rc);
5660 
5661  KMP_MB();
5662  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5663  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5664  (*pteam)->t.t_pkfn));
5665  }
5666 #if OMPT_SUPPORT
5667  if (ompt_enabled.enabled) {
5668  /* no frame set while outside task */
5669  __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5670 
5671  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5672  }
5673 #endif
5674  /* join barrier after parallel region */
5675  __kmp_join_barrier(gtid);
5676  }
5677  }
5678  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5679 
5680 #if OMPT_SUPPORT
5681  if (ompt_enabled.ompt_callback_thread_end) {
5682  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5683  }
5684 #endif
5685 
5686  this_thr->th.th_task_team = NULL;
5687  /* run the destructors for the threadprivate data for this thread */
5688  __kmp_common_destroy_gtid(gtid);
5689 
5690  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5691  KMP_MB();
5692  return this_thr;
5693 }
5694 
5695 /* ------------------------------------------------------------------------ */
5696 
5697 void __kmp_internal_end_dest(void *specific_gtid) {
5698 #if KMP_COMPILER_ICC
5699 #pragma warning(push)
5700 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5701 // significant bits
5702 #endif
5703  // Make sure no significant bits are lost
5704  int gtid = (kmp_intptr_t)specific_gtid - 1;
5705 #if KMP_COMPILER_ICC
5706 #pragma warning(pop)
5707 #endif
5708 
5709  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5710  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5711  * this is because 0 is reserved for the nothing-stored case */
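  /* For example, a stored value of 1 corresponds to gtid 0, while a stored
     value of 0 means nothing has been stored for this thread. */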
5712 
5713  /* josh: One reason for setting the gtid specific data even when it is being
5714  destroyed by pthread is to allow gtid lookup through thread specific data
5715  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5716  that gets executed in the call to __kmp_internal_end_thread, actually
5717  gets the gtid through the thread specific data. Setting it here seems
5718  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5719  to run smoothly.
5720  todo: get rid of this after we remove the dependence on
5721  __kmp_gtid_get_specific */
5722  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5723  __kmp_gtid_set_specific(gtid);
5724 #ifdef KMP_TDATA_GTID
5725  __kmp_gtid = gtid;
5726 #endif
5727  __kmp_internal_end_thread(gtid);
5728 }
5729 
5730 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5731 
5732 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases
5733 // destructors work perfectly, but in real libomp.so I have no evidence it is
5734 // ever called. However, -fini linker option in makefile.mk works fine.
5735 
5736 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5737  __kmp_internal_end_atexit();
5738 }
5739 
5740 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5741 
5742 #endif
5743 
5744 /* [Windows] josh: when the atexit handler is called, there may still be more
5745  than one thread alive */
5746 void __kmp_internal_end_atexit(void) {
5747  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5748  /* [Windows]
5749  josh: ideally, we want to completely shutdown the library in this atexit
5750  handler, but stat code that depends on thread specific data for gtid fails
5751  because that data becomes unavailable at some point during the shutdown, so
5752  we call __kmp_internal_end_thread instead. We should eventually remove the
5753  dependency on __kmp_get_specific_gtid in the stat code and use
5754  __kmp_internal_end_library to cleanly shutdown the library.
5755 
5756  // TODO: Can some of this comment about GVS be removed?
5757  I suspect that the offending stat code is executed when the calling thread
5758  tries to clean up a dead root thread's data structures, resulting in GVS
5759  code trying to close the GVS structures for that thread, but since the stat
5760  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5761  the calling thread is cleaning up itself instead of another thread, it gets
5762  confused. This happens because allowing a thread to unregister and clean up
5763  another thread is a recent modification for addressing an issue.
5764  Based on the current design (20050722), a thread may end up
5765  trying to unregister another thread only if thread death does not trigger
5766  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5767  thread specific data destructor function to detect thread death. For
5768  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5769  is nothing. Thus, the workaround is applicable only for Windows static
5770  stat library. */
5771  __kmp_internal_end_library(-1);
5772 #if KMP_OS_WINDOWS
5773  __kmp_close_console();
5774 #endif
5775 }
5776 
5777 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5778  // It is assumed __kmp_forkjoin_lock is acquired.
5779 
5780  int gtid;
5781 
5782  KMP_DEBUG_ASSERT(thread != NULL);
5783 
5784  gtid = thread->th.th_info.ds.ds_gtid;
5785 
5786  if (!is_root) {
5787 
5788  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5789  /* Assume the threads are at the fork barrier here */
5790  KA_TRACE(
5791  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5792  gtid));
5793  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5794  * (GEH) */
5795  ANNOTATE_HAPPENS_BEFORE(thread);
5796  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5797  __kmp_release_64(&flag);
5798  }
5799 
5800  // Terminate OS thread.
5801  __kmp_reap_worker(thread);
5802 
5803  // The thread was killed asynchronously. If it was actively
5804  // spinning in the thread pool, decrement the global count.
5805  //
5806  // There is a small timing hole here - if the worker thread was just waking
5807  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5808  // had not yet decremented the global counter __kmp_thread_pool_active_nth, then
5809  // the global counter might not get updated.
5810  //
5811  // Currently, this can only happen as the library is unloaded,
5812  // so there are no harmful side effects.
5813  if (thread->th.th_active_in_pool) {
5814  thread->th.th_active_in_pool = FALSE;
5815  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5816  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5817  }
5818 
5819  // Decrement # of [worker] threads in the pool.
5820  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5821  --__kmp_thread_pool_nth;
5822  }
5823 
5824  __kmp_free_implicit_task(thread);
5825 
5826 // Free the fast memory for tasking
5827 #if USE_FAST_MEMORY
5828  __kmp_free_fast_memory(thread);
5829 #endif /* USE_FAST_MEMORY */
5830 
5831  __kmp_suspend_uninitialize_thread(thread);
5832 
5833  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5834  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5835 
5836  --__kmp_all_nth;
5837 // __kmp_nth was decremented when thread is added to the pool.
5838 
5839 #ifdef KMP_ADJUST_BLOCKTIME
5840  /* Adjust blocktime back to user setting or default if necessary */
5841  /* Middle initialization might never have occurred */
5842  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5843  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5844  if (__kmp_nth <= __kmp_avail_proc) {
5845  __kmp_zero_bt = FALSE;
5846  }
5847  }
5848 #endif /* KMP_ADJUST_BLOCKTIME */
5849 
5850  /* free the memory being used */
5851  if (__kmp_env_consistency_check) {
5852  if (thread->th.th_cons) {
5853  __kmp_free_cons_stack(thread->th.th_cons);
5854  thread->th.th_cons = NULL;
5855  }
5856  }
5857 
5858  if (thread->th.th_pri_common != NULL) {
5859  __kmp_free(thread->th.th_pri_common);
5860  thread->th.th_pri_common = NULL;
5861  }
5862 
5863  if (thread->th.th_task_state_memo_stack != NULL) {
5864  __kmp_free(thread->th.th_task_state_memo_stack);
5865  thread->th.th_task_state_memo_stack = NULL;
5866  }
5867 
5868 #if KMP_USE_BGET
5869  if (thread->th.th_local.bget_data != NULL) {
5870  __kmp_finalize_bget(thread);
5871  }
5872 #endif
5873 
5874 #if KMP_AFFINITY_SUPPORTED
5875  if (thread->th.th_affin_mask != NULL) {
5876  KMP_CPU_FREE(thread->th.th_affin_mask);
5877  thread->th.th_affin_mask = NULL;
5878  }
5879 #endif /* KMP_AFFINITY_SUPPORTED */
5880 
5881 #if KMP_USE_HIER_SCHED
5882  if (thread->th.th_hier_bar_data != NULL) {
5883  __kmp_free(thread->th.th_hier_bar_data);
5884  thread->th.th_hier_bar_data = NULL;
5885  }
5886 #endif
5887 
5888  __kmp_reap_team(thread->th.th_serial_team);
5889  thread->th.th_serial_team = NULL;
5890  __kmp_free(thread);
5891 
5892  KMP_MB();
5893 
5894 } // __kmp_reap_thread
5895 
5896 static void __kmp_internal_end(void) {
5897  int i;
5898 
5899  /* First, unregister the library */
5900  __kmp_unregister_library();
5901 
5902 #if KMP_OS_WINDOWS
5903  /* In Win static library, we can't tell when a root actually dies, so we
5904  reclaim the data structures for any root threads that have died but not
5905  unregistered themselves, in order to shut down cleanly.
5906  In Win dynamic library we also can't tell when a thread dies. */
5907  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5908 // dead roots
5909 #endif
5910 
5911  for (i = 0; i < __kmp_threads_capacity; i++)
5912  if (__kmp_root[i])
5913  if (__kmp_root[i]->r.r_active)
5914  break;
5915  KMP_MB(); /* Flush all pending memory write invalidates. */
5916  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5917 
5918  if (i < __kmp_threads_capacity) {
5919 #if KMP_USE_MONITOR
5920  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5921  KMP_MB(); /* Flush all pending memory write invalidates. */
5922 
5923  // Need to check that monitor was initialized before reaping it. If we are
5924  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5925  // __kmp_monitor will appear to contain valid data, but it is only valid in
5926  // the parent process, not the child.
5927  // New behavior (201008): instead of keying off of the flag
5928  // __kmp_init_parallel, the monitor thread creation is keyed off
5929  // of the new flag __kmp_init_monitor.
5930  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5931  if (TCR_4(__kmp_init_monitor)) {
5932  __kmp_reap_monitor(&__kmp_monitor);
5933  TCW_4(__kmp_init_monitor, 0);
5934  }
5935  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5936  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5937 #endif // KMP_USE_MONITOR
5938  } else {
5939 /* TODO move this to cleanup code */
5940 #ifdef KMP_DEBUG
5941  /* make sure that everything has properly ended */
5942  for (i = 0; i < __kmp_threads_capacity; i++) {
5943  if (__kmp_root[i]) {
5944  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5945  // there can be uber threads alive here
5946  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5947  }
5948  }
5949 #endif
5950 
5951  KMP_MB();
5952 
5953  // Reap the worker threads.
5954  // This is valid for now, but be careful if threads are reaped sooner.
5955  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
5956  // Get the next thread from the pool.
5957  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5958  __kmp_thread_pool = thread->th.th_next_pool;
5959  // Reap it.
5960  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5961  thread->th.th_next_pool = NULL;
5962  thread->th.th_in_pool = FALSE;
5963  __kmp_reap_thread(thread, 0);
5964  }
5965  __kmp_thread_pool_insert_pt = NULL;
5966 
5967  // Reap teams.
5968  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
5969  // Get the next team from the pool.
5970  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5971  __kmp_team_pool = team->t.t_next_pool;
5972  // Reap it.
5973  team->t.t_next_pool = NULL;
5974  __kmp_reap_team(team);
5975  }
5976 
5977  __kmp_reap_task_teams();
5978 
5979 #if KMP_OS_UNIX
5980  // Threads that are not reaped should not access any resources since they
5981  // are going to be deallocated soon, so the shutdown sequence should wait
5982  // until all threads either exit the final spin-waiting loop or begin
5983  // sleeping after the given blocktime.
5984  for (i = 0; i < __kmp_threads_capacity; i++) {
5985  kmp_info_t *thr = __kmp_threads[i];
5986  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
5987  KMP_CPU_PAUSE();
5988  }
5989 #endif
5990 
5991  for (i = 0; i < __kmp_threads_capacity; ++i) {
5992  // TBD: Add some checking...
5993  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5994  }
5995 
5996  /* Make sure all threadprivate destructors get run by joining with all
5997  worker threads before resetting this flag */
5998  TCW_SYNC_4(__kmp_init_common, FALSE);
5999 
6000  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6001  KMP_MB();
6002 
6003 #if KMP_USE_MONITOR
6004  // See note above: One of the possible fixes for CQ138434 / CQ140126
6005  //
6006  // FIXME: push both code fragments down and CSE them?
6007  // push them into __kmp_cleanup() ?
6008  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6009  if (TCR_4(__kmp_init_monitor)) {
6010  __kmp_reap_monitor(&__kmp_monitor);
6011  TCW_4(__kmp_init_monitor, 0);
6012  }
6013  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6014  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6015 #endif
6016  } /* else !__kmp_global.t_active */
6017  TCW_4(__kmp_init_gtid, FALSE);
6018  KMP_MB(); /* Flush all pending memory write invalidates. */
6019 
6020  __kmp_cleanup();
6021 #if OMPT_SUPPORT
6022  ompt_fini();
6023 #endif
6024 }
6025 
6026 void __kmp_internal_end_library(int gtid_req) {
6027  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6028  /* this shouldn't be a race condition because __kmp_internal_end() is the
6029  only place to clear __kmp_serial_init */
6030  /* we'll check this later too, after we get the lock */
6031  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6032  // redundant, because the next check will work in any case.
6033  if (__kmp_global.g.g_abort) {
6034  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6035  /* TODO abort? */
6036  return;
6037  }
6038  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6039  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6040  return;
6041  }
6042 
6043  KMP_MB(); /* Flush all pending memory write invalidates. */
6044 
6045  /* find out who we are and what we should do */
6046  {
6047  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6048  KA_TRACE(
6049  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6050  if (gtid == KMP_GTID_SHUTDOWN) {
6051  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6052  "already shutdown\n"));
6053  return;
6054  } else if (gtid == KMP_GTID_MONITOR) {
6055  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6056  "registered, or system shutdown\n"));
6057  return;
6058  } else if (gtid == KMP_GTID_DNE) {
6059  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6060  "shutdown\n"));
6061  /* we don't know who we are, but we may still shutdown the library */
6062  } else if (KMP_UBER_GTID(gtid)) {
6063  /* unregister ourselves as an uber thread. gtid is no longer valid */
6064  if (__kmp_root[gtid]->r.r_active) {
6065  __kmp_global.g.g_abort = -1;
6066  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6067  KA_TRACE(10,
6068  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6069  gtid));
6070  return;
6071  } else {
6072  KA_TRACE(
6073  10,
6074  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6075  __kmp_unregister_root_current_thread(gtid);
6076  }
6077  } else {
6078 /* worker threads may call this function through the atexit handler, if they
6079  * call exit() */
6080 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6081  TODO: do a thorough shutdown instead */
6082 #ifdef DUMP_DEBUG_ON_EXIT
6083  if (__kmp_debug_buf)
6084  __kmp_dump_debug_buffer();
6085 #endif
6086  return;
6087  }
6088  }
6089  /* synchronize the termination process */
6090  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6091 
6092  /* have we already finished */
6093  if (__kmp_global.g.g_abort) {
6094  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6095  /* TODO abort? */
6096  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6097  return;
6098  }
6099  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6100  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6101  return;
6102  }
6103 
6104  /* We need this lock to enforce mutex between this reading of
6105  __kmp_threads_capacity and the writing by __kmp_register_root.
6106  Alternatively, we can use a counter of roots that is atomically updated by
6107  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6108  __kmp_internal_end_*. */
6109  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6110 
6111  /* now we can safely conduct the actual termination */
6112  __kmp_internal_end();
6113 
6114  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6115  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6116 
6117  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6118 
6119 #ifdef DUMP_DEBUG_ON_EXIT
6120  if (__kmp_debug_buf)
6121  __kmp_dump_debug_buffer();
6122 #endif
6123 
6124 #if KMP_OS_WINDOWS
6125  __kmp_close_console();
6126 #endif
6127 
6128  __kmp_fini_allocator();
6129 
6130 } // __kmp_internal_end_library
6131 
6132 void __kmp_internal_end_thread(int gtid_req) {
6133  int i;
6134 
6135  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6136  /* this shouldn't be a race condition because __kmp_internal_end() is the
6137  * only place to clear __kmp_serial_init */
6138  /* we'll check this later too, after we get the lock */
6139  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6140  // redundant, because the next check will work in any case.
6141  if (__kmp_global.g.g_abort) {
6142  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6143  /* TODO abort? */
6144  return;
6145  }
6146  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6147  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6148  return;
6149  }
6150 
6151  KMP_MB(); /* Flush all pending memory write invalidates. */
6152 
6153  /* find out who we are and what we should do */
6154  {
6155  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6156  KA_TRACE(10,
6157  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6158  if (gtid == KMP_GTID_SHUTDOWN) {
6159  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6160  "already shutdown\n"));
6161  return;
6162  } else if (gtid == KMP_GTID_MONITOR) {
6163  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6164  "registered, or system shutdown\n"));
6165  return;
6166  } else if (gtid == KMP_GTID_DNE) {
6167  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6168  "shutdown\n"));
6169  return;
6170  /* we don't know who we are */
6171  } else if (KMP_UBER_GTID(gtid)) {
6172  /* unregister ourselves as an uber thread. gtid is no longer valid */
6173  if (__kmp_root[gtid]->r.r_active) {
6174  __kmp_global.g.g_abort = -1;
6175  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6176  KA_TRACE(10,
6177  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6178  gtid));
6179  return;
6180  } else {
6181  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6182  gtid));
6183  __kmp_unregister_root_current_thread(gtid);
6184  }
6185  } else {
6186  /* just a worker thread, let's leave */
6187  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6188 
6189  if (gtid >= 0) {
6190  __kmp_threads[gtid]->th.th_task_team = NULL;
6191  }
6192 
6193  KA_TRACE(10,
6194  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6195  gtid));
6196  return;
6197  }
6198  }
6199 #if defined KMP_DYNAMIC_LIB
6200  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber
6201  // thread, because it is better to shut down later in the library destructor.
6202  // The reason for this change is a performance problem when a non-OpenMP thread
6203  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
6204  // keeping worker threads alive until the program shuts down.
6205  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6206  // and Windows(DPD200287443) that occurs when using critical sections from
6207  // foreign threads.
6208  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6209  return;
6210 #endif
6211  /* synchronize the termination process */
6212  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6213 
6214  /* have we already finished */
6215  if (__kmp_global.g.g_abort) {
6216  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6217  /* TODO abort? */
6218  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6219  return;
6220  }
6221  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6222  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6223  return;
6224  }
6225 
6226  /* We need this lock to enforce mutex between this reading of
6227  __kmp_threads_capacity and the writing by __kmp_register_root.
6228  Alternatively, we can use a counter of roots that is atomically updated by
6229  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6230  __kmp_internal_end_*. */
6231 
6232  /* should we finish the run-time? are all siblings done? */
6233  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6234 
6235  for (i = 0; i < __kmp_threads_capacity; ++i) {
6236  if (KMP_UBER_GTID(i)) {
6237  KA_TRACE(
6238  10,
6239  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6240  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6241  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6242  return;
6243  }
6244  }
6245 
6246  /* now we can safely conduct the actual termination */
6247 
6248  __kmp_internal_end();
6249 
6250  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6252 
6253  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6254 
6255 #ifdef DUMP_DEBUG_ON_EXIT
6256  if (__kmp_debug_buf)
6257  __kmp_dump_debug_buffer();
6258 #endif
6259 } // __kmp_internal_end_thread
6260 
6261 // -----------------------------------------------------------------------------
6262 // Library registration stuff.
6263 
6264 static long __kmp_registration_flag = 0;
6265 // Random value used to indicate library initialization.
6266 static char *__kmp_registration_str = NULL;
6267 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6268 
6269 static inline char *__kmp_reg_status_name() {
6270  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6271  each thread. If registration and unregistration happen in different threads
6272  (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6273  env var cannot be found, because its name will contain a different pid. */
6274  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6275 } // __kmp_reg_status_name
6276 
6277 void __kmp_register_library_startup(void) {
6278 
6279  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6280  int done = 0;
6281  union {
6282  double dtime;
6283  long ltime;
6284  } time;
6285 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6286  __kmp_initialize_system_tick();
6287 #endif
6288  __kmp_read_system_time(&time.dtime);
6289  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6290  __kmp_registration_str =
6291  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6292  __kmp_registration_flag, KMP_LIBRARY_FILE);
6293 
6294  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6295  __kmp_registration_str));
6296 
6297  while (!done) {
6298 
6299  char *value = NULL; // Actual value of the environment variable.
6300 
6301  // Set the environment variable, but do not overwrite it if it already exists.
6302  __kmp_env_set(name, __kmp_registration_str, 0);
6303  // Check that the variable was actually written.
6304  value = __kmp_env_get(name);
6305  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6306 
6307  done = 1; // Ok, environment variable set successfully, exit the loop.
6308 
6309  } else {
6310 
6311  // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6312  // Check whether it is alive or dead.
6313  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6314  char *tail = value;
6315  char *flag_addr_str = NULL;
6316  char *flag_val_str = NULL;
6317  char const *file_name = NULL;
6318  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6319  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6320  file_name = tail;
6321  if (tail != NULL) {
6322  long *flag_addr = 0;
6323  long flag_val = 0;
6324  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6325  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6326  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6327  // First, check whether environment-encoded address is mapped into
6328  // addr space.
6329  // If so, dereference it to see if it still has the right value.
6330  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6331  neighbor = 1;
6332  } else {
6333  // If not, then we know the other copy of the library is no longer
6334  // running.
6335  neighbor = 2;
6336  }
6337  }
6338  }
6339  switch (neighbor) {
6340  case 0: // Cannot parse environment variable -- neighbor status unknown.
6341  // Assume it is the incompatible format of a future version of the
6342  // library. Assume the other library is alive.
6343  // WARN( ... ); // TODO: Issue a warning.
6344  file_name = "unknown library";
6345  // Attention! Falling through to the next case. That's intentional.
6346  case 1: { // Neighbor is alive.
6347  // Check it is allowed.
6348  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6349  if (!__kmp_str_match_true(duplicate_ok)) {
6350  // That's not allowed. Issue fatal error.
6351  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6352  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6353  }
6354  KMP_INTERNAL_FREE(duplicate_ok);
6355  __kmp_duplicate_library_ok = 1;
6356  done = 1; // Exit the loop.
6357  } break;
6358  case 2: { // Neighbor is dead.
6359  // Clear the variable and try to register library again.
6360  __kmp_env_unset(name);
6361  } break;
6362  default: { KMP_DEBUG_ASSERT(0); } break;
6363  }
6364  }
6365  KMP_INTERNAL_FREE((void *)value);
6366  }
6367  KMP_INTERNAL_FREE((void *)name);
6368 
6369 } // func __kmp_register_library_startup
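
// [Editor's sketch -- not part of kmp_runtime.cpp] The registration record set
// above is a single string of the form "%p-%lx-%s": the address of
// __kmp_registration_flag, its random value, and the library file name, joined
// by '-'. A second copy of the runtime reads the variable back, splits it on
// '-', checks whether the encoded address is still mapped, and compares the
// stored value to decide whether the registering copy is alive. The standalone
// C program below mimics that encode/decode round trip; the helper names
// (demo_make_reg_string, demo_parse_reg_string) are hypothetical.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *demo_make_reg_string(long *flag_addr, const char *lib_file) {
  char buf[256];
  // Same layout as __kmp_str_format("%p-%lx-%s", ...) above.
  snprintf(buf, sizeof(buf), "%p-%lx-%s", (void *)flag_addr,
           (unsigned long)*flag_addr, lib_file);
  return strdup(buf);
}

static void demo_parse_reg_string(char *value) {
  // Split "addr-value-file" on the first two '-' characters (in place).
  char *flag_val_str = strchr(value, '-');
  if (flag_val_str == NULL)
    return;
  *flag_val_str++ = '\0';
  char *file_name = strchr(flag_val_str, '-');
  if (file_name == NULL)
    return;
  *file_name++ = '\0';
  void *flag_addr = NULL;
  unsigned long flag_val = 0;
  sscanf(value, "%p", &flag_addr);
  sscanf(flag_val_str, "%lx", &flag_val);
  printf("addr=%p value=%lx file=%s\n", flag_addr, flag_val, file_name);
}

int main(void) {
  static long registration_flag = 0x0CAFE001L;
  char *s = demo_make_reg_string(&registration_flag, "libomp.so");
  demo_parse_reg_string(s);
  free(s);
  return 0;
}
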
6370 
6371 void __kmp_unregister_library(void) {
6372 
6373  char *name = __kmp_reg_status_name();
6374  char *value = __kmp_env_get(name);
6375 
6376  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6377  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6378  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6379  // Ok, this is our variable. Delete it.
6380  __kmp_env_unset(name);
6381  }
6382 
6383  KMP_INTERNAL_FREE(__kmp_registration_str);
6384  KMP_INTERNAL_FREE(value);
6385  KMP_INTERNAL_FREE(name);
6386 
6387  __kmp_registration_flag = 0;
6388  __kmp_registration_str = NULL;
6389 
6390 } // __kmp_unregister_library
6391 
6392 // End of Library registration stuff.
6393 // -----------------------------------------------------------------------------
6394 
6395 #if KMP_MIC_SUPPORTED
6396 
6397 static void __kmp_check_mic_type() {
6398  kmp_cpuid_t cpuid_state = {0};
6399  kmp_cpuid_t *cs_p = &cpuid_state;
6400  __kmp_x86_cpuid(1, 0, cs_p);
6401  // We don't support mic1 at the moment
6402  if ((cs_p->eax & 0xff0) == 0xB10) {
6403  __kmp_mic_type = mic2;
6404  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6405  __kmp_mic_type = mic3;
6406  } else {
6407  __kmp_mic_type = non_mic;
6408  }
6409 }
6410 
6411 #endif /* KMP_MIC_SUPPORTED */
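
// [Editor's sketch -- not part of kmp_runtime.cpp] __kmp_check_mic_type above
// reads CPUID leaf 1: EAX packs stepping[3:0], model[7:4], family[11:8] and
// extended model[19:16], so masking with 0xff0 / 0xf0ff0 isolates the
// family/model signature compared against KNC (0xB10) and KNL (0x50670).
// A minimal x86-only illustration, assuming a GCC/Clang toolchain that
// provides <cpuid.h>:
#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    printf("CPUID leaf 1 not supported\n");
    return 1;
  }
  if ((eax & 0xff0) == 0xB10)
    printf("signature matches KNC (mic2)\n");
  else if ((eax & 0xf0ff0) == 0x50670)
    printf("signature matches KNL (mic3)\n");
  else
    printf("not a MIC signature (eax=0x%x)\n", eax);
  return 0;
}
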
6412 
6413 static void __kmp_do_serial_initialize(void) {
6414  int i, gtid;
6415  int size;
6416 
6417  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6418 
6419  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6420  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6421  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6422  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6423  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6424 
6425 #if OMPT_SUPPORT
6426  ompt_pre_init();
6427 #endif
6428 
6429  __kmp_validate_locks();
6430 
6431  /* Initialize internal memory allocator */
6432  __kmp_init_allocator();
6433 
6434  /* Register the library startup via an environment variable and check to see
6435  whether another copy of the library is already registered. */
6436 
6437  __kmp_register_library_startup();
6438 
6439  /* TODO reinitialization of library */
6440  if (TCR_4(__kmp_global.g.g_done)) {
6441  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6442  }
6443 
6444  __kmp_global.g.g_abort = 0;
6445  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6446 
6447 /* initialize the locks */
6448 #if KMP_USE_ADAPTIVE_LOCKS
6449 #if KMP_DEBUG_ADAPTIVE_LOCKS
6450  __kmp_init_speculative_stats();
6451 #endif
6452 #endif
6453 #if KMP_STATS_ENABLED
6454  __kmp_stats_init();
6455 #endif
6456  __kmp_init_lock(&__kmp_global_lock);
6457  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6458  __kmp_init_lock(&__kmp_debug_lock);
6459  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6460  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6461  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6462  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6463  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6464  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6465  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6466  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6467  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6468  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6469  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6470  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6471  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6472  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6473  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6474 #if KMP_USE_MONITOR
6475  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6476 #endif
6477  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6478 
6479  /* conduct initialization and initial setup of configuration */
6480 
6481  __kmp_runtime_initialize();
6482 
6483 #if KMP_MIC_SUPPORTED
6484  __kmp_check_mic_type();
6485 #endif
6486 
6487 // Some global variable initialization moved here from kmp_env_initialize()
6488 #ifdef KMP_DEBUG
6489  kmp_diag = 0;
6490 #endif
6491  __kmp_abort_delay = 0;
6492 
6493  // From __kmp_init_dflt_team_nth()
6494  /* assume the entire machine will be used */
6495  __kmp_dflt_team_nth_ub = __kmp_xproc;
6496  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6497  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6498  }
6499  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6500  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6501  }
6502  __kmp_max_nth = __kmp_sys_max_nth;
6503  __kmp_cg_max_nth = __kmp_sys_max_nth;
6504  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6505  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6506  __kmp_teams_max_nth = __kmp_sys_max_nth;
6507  }
6508 
6509  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6510  // part
6511  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6512 #if KMP_USE_MONITOR
6513  __kmp_monitor_wakeups =
6514  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6515  __kmp_bt_intervals =
6516  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6517 #endif
6518  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6519  __kmp_library = library_throughput;
6520  // From KMP_SCHEDULE initialization
6521  __kmp_static = kmp_sch_static_balanced;
6522 // AC: do not use analytical here, because it is non-monotonic
6523 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6524 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6525 // need to repeat assignment
6526 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6527 // bit control and barrier method control parts
6528 #if KMP_FAST_REDUCTION_BARRIER
6529 #define kmp_reduction_barrier_gather_bb ((int)1)
6530 #define kmp_reduction_barrier_release_bb ((int)1)
6531 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6532 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6533 #endif // KMP_FAST_REDUCTION_BARRIER
6534  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6535  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6536  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6537  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6538  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6539 #if KMP_FAST_REDUCTION_BARRIER
6540  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6541  // lin_64 ): hyper,1
6542  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6543  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6544  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6545  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6546  }
6547 #endif // KMP_FAST_REDUCTION_BARRIER
6548  }
6549 #if KMP_FAST_REDUCTION_BARRIER
6550 #undef kmp_reduction_barrier_release_pat
6551 #undef kmp_reduction_barrier_gather_pat
6552 #undef kmp_reduction_barrier_release_bb
6553 #undef kmp_reduction_barrier_gather_bb
6554 #endif // KMP_FAST_REDUCTION_BARRIER
6555 #if KMP_MIC_SUPPORTED
6556  if (__kmp_mic_type == mic2) { // KNC
6557  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6558  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6559  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6560  1; // forkjoin release
6561  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6562  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6563  }
6564 #if KMP_FAST_REDUCTION_BARRIER
6565  if (__kmp_mic_type == mic2) { // KNC
6566  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6567  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6568  }
6569 #endif // KMP_FAST_REDUCTION_BARRIER
6570 #endif // KMP_MIC_SUPPORTED
6571 
6572 // From KMP_CHECKS initialization
6573 #ifdef KMP_DEBUG
6574  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6575 #else
6576  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6577 #endif
6578 
6579  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6580  __kmp_foreign_tp = TRUE;
6581 
6582  __kmp_global.g.g_dynamic = FALSE;
6583  __kmp_global.g.g_dynamic_mode = dynamic_default;
6584 
6585  __kmp_env_initialize(NULL);
6586 
6587 // Print all messages in message catalog for testing purposes.
6588 #ifdef KMP_DEBUG
6589  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6590  if (__kmp_str_match_true(val)) {
6591  kmp_str_buf_t buffer;
6592  __kmp_str_buf_init(&buffer);
6593  __kmp_i18n_dump_catalog(&buffer);
6594  __kmp_printf("%s", buffer.str);
6595  __kmp_str_buf_free(&buffer);
6596  }
6597  __kmp_env_free(&val);
6598 #endif
6599 
6600  __kmp_threads_capacity =
6601  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6602  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6603  __kmp_tp_capacity = __kmp_default_tp_capacity(
6604  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6605 
6606  // If the library is shut down properly, both pools must be NULL. Just in
6607  // case, set them to NULL -- some memory may leak, but subsequent code will
6608  // work even if pools are not freed.
6609  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6610  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6611  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6612  __kmp_thread_pool = NULL;
6613  __kmp_thread_pool_insert_pt = NULL;
6614  __kmp_team_pool = NULL;
6615 
6616  /* Allocate all of the variable sized records */
6617  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6618  * expandable */
6619  /* Since allocation is cache-aligned, just add extra padding at the end */
6620  size =
6621  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6622  CACHE_LINE;
6623  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6624  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6625  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6626 
6627  /* init thread counts */
6628  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6629  0); // Asserts fail if the library is reinitializing and
6630  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6631  __kmp_all_nth = 0;
6632  __kmp_nth = 0;
6633 
6634  /* setup the uber master thread and hierarchy */
6635  gtid = __kmp_register_root(TRUE);
6636  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6637  KMP_ASSERT(KMP_UBER_GTID(gtid));
6638  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6639 
6640  KMP_MB(); /* Flush all pending memory write invalidates. */
6641 
6642  __kmp_common_initialize();
6643 
6644 #if KMP_OS_UNIX
6645  /* invoke the child fork handler */
6646  __kmp_register_atfork();
6647 #endif
6648 
6649 #if !defined KMP_DYNAMIC_LIB
6650  {
6651  /* Invoke the exit handler when the program finishes, only for static
6652  library. For dynamic library, we already have _fini and DllMain. */
6653  int rc = atexit(__kmp_internal_end_atexit);
6654  if (rc != 0) {
6655  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6656  __kmp_msg_null);
6657  }
6658  }
6659 #endif
6660 
6661 #if KMP_HANDLE_SIGNALS
6662 #if KMP_OS_UNIX
6663  /* NOTE: make sure that this is called before the user installs their own
6664  signal handlers so that the user handlers are called first. this way they
6665  can return false, not call our handler, avoid terminating the library, and
6666  continue execution where they left off. */
6667  __kmp_install_signals(FALSE);
6668 #endif /* KMP_OS_UNIX */
6669 #if KMP_OS_WINDOWS
6670  __kmp_install_signals(TRUE);
6671 #endif /* KMP_OS_WINDOWS */
6672 #endif
6673 
6674  /* we have finished the serial initialization */
6675  __kmp_init_counter++;
6676 
6677  __kmp_init_serial = TRUE;
6678 
6679  if (__kmp_settings) {
6680  __kmp_env_print();
6681  }
6682 
6683 #if OMP_40_ENABLED
6684  if (__kmp_display_env || __kmp_display_env_verbose) {
6685  __kmp_env_print_2();
6686  }
6687 #endif // OMP_40_ENABLED
6688 
6689 #if OMPT_SUPPORT
6690  ompt_post_init();
6691 #endif
6692 
6693  KMP_MB();
6694 
6695  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6696 }
6697 
6698 void __kmp_serial_initialize(void) {
6699  if (__kmp_init_serial) {
6700  return;
6701  }
6702  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6703  if (__kmp_init_serial) {
6704  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6705  return;
6706  }
6707  __kmp_do_serial_initialize();
6708  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6709 }
6710 
6711 static void __kmp_do_middle_initialize(void) {
6712  int i, j;
6713  int prev_dflt_team_nth;
6714 
6715  if (!__kmp_init_serial) {
6716  __kmp_do_serial_initialize();
6717  }
6718 
6719  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6720 
6721  // Save the previous value for the __kmp_dflt_team_nth so that
6722  // we can avoid some reinitialization if it hasn't changed.
6723  prev_dflt_team_nth = __kmp_dflt_team_nth;
6724 
6725 #if KMP_AFFINITY_SUPPORTED
6726  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6727  // number of cores on the machine.
6728  __kmp_affinity_initialize();
6729 
6730  // Run through the __kmp_threads array and set the affinity mask
6731  // for each root thread that is currently registered with the RTL.
6732  for (i = 0; i < __kmp_threads_capacity; i++) {
6733  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6734  __kmp_affinity_set_init_mask(i, TRUE);
6735  }
6736  }
6737 #endif /* KMP_AFFINITY_SUPPORTED */
6738 
6739  KMP_ASSERT(__kmp_xproc > 0);
6740  if (__kmp_avail_proc == 0) {
6741  __kmp_avail_proc = __kmp_xproc;
6742  }
6743 
6744  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6745  // correct them now
6746  j = 0;
6747  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6748  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6749  __kmp_avail_proc;
6750  j++;
6751  }
6752 
6753  if (__kmp_dflt_team_nth == 0) {
6754 #ifdef KMP_DFLT_NTH_CORES
6755  // Default #threads = #cores
6756  __kmp_dflt_team_nth = __kmp_ncores;
6757  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6758  "__kmp_ncores (%d)\n",
6759  __kmp_dflt_team_nth));
6760 #else
6761  // Default #threads = #available OS procs
6762  __kmp_dflt_team_nth = __kmp_avail_proc;
6763  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6764  "__kmp_avail_proc(%d)\n",
6765  __kmp_dflt_team_nth));
6766 #endif /* KMP_DFLT_NTH_CORES */
6767  }
6768 
6769  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6770  __kmp_dflt_team_nth = KMP_MIN_NTH;
6771  }
6772  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6773  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6774  }
6775 
6776  // There's no harm in continuing if the following check fails,
6777  // but it indicates an error in the previous logic.
6778  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6779 
6780  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6781  // Run through the __kmp_threads array and set the num threads icv for each
6782  // root thread that is currently registered with the RTL (which has not
6783  // already explicitly set its nthreads-var with a call to
6784  // omp_set_num_threads()).
6785  for (i = 0; i < __kmp_threads_capacity; i++) {
6786  kmp_info_t *thread = __kmp_threads[i];
6787  if (thread == NULL)
6788  continue;
6789  if (thread->th.th_current_task->td_icvs.nproc != 0)
6790  continue;
6791 
6792  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6793  }
6794  }
6795  KA_TRACE(
6796  20,
6797  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6798  __kmp_dflt_team_nth));
6799 
6800 #ifdef KMP_ADJUST_BLOCKTIME
6801  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6802  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6803  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6804  if (__kmp_nth > __kmp_avail_proc) {
6805  __kmp_zero_bt = TRUE;
6806  }
6807  }
6808 #endif /* KMP_ADJUST_BLOCKTIME */
6809 
6810  /* we have finished middle initialization */
6811  TCW_SYNC_4(__kmp_init_middle, TRUE);
6812 
6813  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6814 }
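
// [Editor's sketch -- not part of kmp_runtime.cpp] The "empty places" loop in
// __kmp_do_middle_initialize above patches only the *leading* unset entries of
// the nested num-threads list. With OMP_NUM_THREADS=",,2,3" the parser records
// {0, 0, 2, 3}; with __kmp_avail_proc == 8 the loop turns that into {8, 8, 2, 3}.
// A standalone illustration (demo_fix_nested_nth is a hypothetical name):
#include <stdio.h>

static void demo_fix_nested_nth(int *nth, int used, int avail_proc) {
  int j = 0;
  // Only leading zero entries are replaced; later entries stay as parsed.
  while (j < used && !nth[j]) {
    nth[j] = avail_proc;
    j++;
  }
}

int main(void) {
  int nth[] = {0, 0, 2, 3}; // parsed from OMP_NUM_THREADS=",,2,3"
  demo_fix_nested_nth(nth, 4, 8);
  for (int j = 0; j < 4; j++)
    printf("%d ", nth[j]); // prints: 8 8 2 3
  printf("\n");
  return 0;
}
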
6815 
6816 void __kmp_middle_initialize(void) {
6817  if (__kmp_init_middle) {
6818  return;
6819  }
6820  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6821  if (__kmp_init_middle) {
6822  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6823  return;
6824  }
6825  __kmp_do_middle_initialize();
6826  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6827 }
6828 
6829 void __kmp_parallel_initialize(void) {
6830  int gtid = __kmp_entry_gtid(); // this might be a new root
6831 
6832  /* synchronize parallel initialization (for sibling) */
6833  if (TCR_4(__kmp_init_parallel))
6834  return;
6835  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6836  if (TCR_4(__kmp_init_parallel)) {
6837  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6838  return;
6839  }
6840 
6841  /* TODO reinitialization after we have already shut down */
6842  if (TCR_4(__kmp_global.g.g_done)) {
6843  KA_TRACE(
6844  10,
6845  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6846  __kmp_infinite_loop();
6847  }
6848 
6849  /* jc: The lock __kmp_initz_lock is already held, so calling
6850  __kmp_serial_initialize would cause a deadlock. So we call
6851  __kmp_do_serial_initialize directly. */
6852  if (!__kmp_init_middle) {
6853  __kmp_do_middle_initialize();
6854  }
6855 
6856  /* begin initialization */
6857  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6858  KMP_ASSERT(KMP_UBER_GTID(gtid));
6859 
6860 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6861  // Save the FP control regs.
6862  // Worker threads will set theirs to these values at thread startup.
6863  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6864  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6865  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6866 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6867 
6868 #if KMP_OS_UNIX
6869 #if KMP_HANDLE_SIGNALS
6870  /* must be after __kmp_serial_initialize */
6871  __kmp_install_signals(TRUE);
6872 #endif
6873 #endif
6874 
6875  __kmp_suspend_initialize();
6876 
6877 #if defined(USE_LOAD_BALANCE)
6878  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6879  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6880  }
6881 #else
6882  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6883  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6884  }
6885 #endif
6886 
6887  if (__kmp_version) {
6888  __kmp_print_version_2();
6889  }
6890 
6891  /* we have finished parallel initialization */
6892  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6893 
6894  KMP_MB();
6895  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6896 
6897  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6898 }
6899 
6900 /* ------------------------------------------------------------------------ */
6901 
6902 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6903  kmp_team_t *team) {
6904  kmp_disp_t *dispatch;
6905 
6906  KMP_MB();
6907 
6908  /* none of the threads have encountered any constructs, yet. */
6909  this_thr->th.th_local.this_construct = 0;
6910 #if KMP_CACHE_MANAGE
6911  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6912 #endif /* KMP_CACHE_MANAGE */
6913  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6914  KMP_DEBUG_ASSERT(dispatch);
6915  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6916  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6917  // this_thr->th.th_info.ds.ds_tid ] );
6918 
6919  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6920 #if OMP_45_ENABLED
6921  dispatch->th_doacross_buf_idx =
6922  0; /* reset the doacross dispatch buffer counter */
6923 #endif
6924  if (__kmp_env_consistency_check)
6925  __kmp_push_parallel(gtid, team->t.t_ident);
6926 
6927  KMP_MB(); /* Flush all pending memory write invalidates. */
6928 }
6929 
6930 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6931  kmp_team_t *team) {
6932  if (__kmp_env_consistency_check)
6933  __kmp_pop_parallel(gtid, team->t.t_ident);
6934 
6935  __kmp_finish_implicit_task(this_thr);
6936 }
6937 
6938 int __kmp_invoke_task_func(int gtid) {
6939  int rc;
6940  int tid = __kmp_tid_from_gtid(gtid);
6941  kmp_info_t *this_thr = __kmp_threads[gtid];
6942  kmp_team_t *team = this_thr->th.th_team;
6943 
6944  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6945 #if USE_ITT_BUILD
6946  if (__itt_stack_caller_create_ptr) {
6947  __kmp_itt_stack_callee_enter(
6948  (__itt_caller)
6949  team->t.t_stack_id); // inform ittnotify about entering user's code
6950  }
6951 #endif /* USE_ITT_BUILD */
6952 #if INCLUDE_SSC_MARKS
6953  SSC_MARK_INVOKING();
6954 #endif
6955 
6956 #if OMPT_SUPPORT
6957  void *dummy;
6958  void **exit_runtime_p;
6959  ompt_data_t *my_task_data;
6960  ompt_data_t *my_parallel_data;
6961  int ompt_team_size;
6962 
6963  if (ompt_enabled.enabled) {
6964  exit_runtime_p = &(
6965  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6966  } else {
6967  exit_runtime_p = &dummy;
6968  }
6969 
6970  my_task_data =
6971  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6972  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6973  if (ompt_enabled.ompt_callback_implicit_task) {
6974  ompt_team_size = team->t.t_nproc;
6975  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6976  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6977  __kmp_tid_from_gtid(gtid));
6978  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6979  }
6980 #endif
6981 
6982  {
6983  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6984  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6985  rc =
6986  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6987  tid, (int)team->t.t_argc, (void **)team->t.t_argv
6988 #if OMPT_SUPPORT
6989  ,
6990  exit_runtime_p
6991 #endif
6992  );
6993 #if OMPT_SUPPORT
6994  *exit_runtime_p = NULL;
6995 #endif
6996  }
6997 
6998 #if USE_ITT_BUILD
6999  if (__itt_stack_caller_create_ptr) {
7000  __kmp_itt_stack_callee_leave(
7001  (__itt_caller)
7002  team->t.t_stack_id); // inform ittnotify about leaving user's code
7003  }
7004 #endif /* USE_ITT_BUILD */
7005  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7006 
7007  return rc;
7008 }
7009 
7010 #if OMP_40_ENABLED
7011 void __kmp_teams_master(int gtid) {
7012  // This routine is called by all master threads in teams construct
7013  kmp_info_t *thr = __kmp_threads[gtid];
7014  kmp_team_t *team = thr->th.th_team;
7015  ident_t *loc = team->t.t_ident;
7016  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7017  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7018  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7019  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7020  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7021 // Launch the league of teams now, but do not let workers execute
7022 // (they hang on the fork barrier until the next parallel region)
7023 #if INCLUDE_SSC_MARKS
7024  SSC_MARK_FORKING();
7025 #endif
7026  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7027  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7028  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7029 #if INCLUDE_SSC_MARKS
7030  SSC_MARK_JOINING();
7031 #endif
7032 
7033  // AC: last parameter "1" eliminates join barrier which won't work because
7034  // worker threads are in a fork barrier waiting for more parallel regions
7035  __kmp_join_call(loc, gtid
7036 #if OMPT_SUPPORT
7037  ,
7038  fork_context_intel
7039 #endif
7040  ,
7041  1);
7042 }
7043 
7044 int __kmp_invoke_teams_master(int gtid) {
7045  kmp_info_t *this_thr = __kmp_threads[gtid];
7046  kmp_team_t *team = this_thr->th.th_team;
7047 #if KMP_DEBUG
7048  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7049  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7050  (void *)__kmp_teams_master);
7051 #endif
7052  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7053  __kmp_teams_master(gtid);
7054  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7055  return 1;
7056 }
7057 #endif /* OMP_40_ENABLED */
7058 
7059 /* this sets the requested number of threads for the next parallel region
7060  encountered by this team. since this should be enclosed in the forkjoin
7061  critical section it should avoid race conditions with asymmetrical nested
7062  parallelism */
7063 
7064 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7065  kmp_info_t *thr = __kmp_threads[gtid];
7066 
7067  if (num_threads > 0)
7068  thr->th.th_set_nproc = num_threads;
7069 }
7070 
7071 #if OMP_40_ENABLED
7072 
7073 /* this sets the requested number of teams for the teams region and/or
7074  the number of threads for the next parallel region encountered */
7075 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7076  int num_threads) {
7077  kmp_info_t *thr = __kmp_threads[gtid];
7078  KMP_DEBUG_ASSERT(num_teams >= 0);
7079  KMP_DEBUG_ASSERT(num_threads >= 0);
7080 
7081  if (num_teams == 0)
7082  num_teams = 1; // default number of teams is 1.
7083  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7084  if (!__kmp_reserve_warn) {
7085  __kmp_reserve_warn = 1;
7086  __kmp_msg(kmp_ms_warning,
7087  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7088  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7089  }
7090  num_teams = __kmp_teams_max_nth;
7091  }
7092  // Set number of teams (number of threads in the outer "parallel" of the
7093  // teams)
7094  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7095 
7096  // Remember the number of threads for inner parallel regions
7097  if (num_threads == 0) {
7098  if (!TCR_4(__kmp_init_middle))
7099  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7100  num_threads = __kmp_avail_proc / num_teams;
7101  if (num_teams * num_threads > __kmp_teams_max_nth) {
7102  // adjust num_threads w/o warning as it is not a user setting
7103  num_threads = __kmp_teams_max_nth / num_teams;
7104  }
7105  } else {
7106  if (num_teams * num_threads > __kmp_teams_max_nth) {
7107  int new_threads = __kmp_teams_max_nth / num_teams;
7108  if (!__kmp_reserve_warn) { // user asked for too many threads
7109  __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7110  __kmp_msg(kmp_ms_warning,
7111  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7112  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7113  }
7114  num_threads = new_threads;
7115  }
7116  }
7117  thr->th.th_teams_size.nth = num_threads;
7118 }
7119 
7120 // Set the proc_bind var to use in the following parallel region.
7121 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7122  kmp_info_t *thr = __kmp_threads[gtid];
7123  thr->th.th_set_proc_bind = proc_bind;
7124 }
7125 
7126 #endif /* OMP_40_ENABLED */
7127 
7128 /* Launch the worker threads into the microtask. */
7129 
7130 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7131  kmp_info_t *this_thr = __kmp_threads[gtid];
7132 
7133 #ifdef KMP_DEBUG
7134  int f;
7135 #endif /* KMP_DEBUG */
7136 
7137  KMP_DEBUG_ASSERT(team);
7138  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7139  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7140  KMP_MB(); /* Flush all pending memory write invalidates. */
7141 
7142  team->t.t_construct = 0; /* no single directives seen yet */
7143  team->t.t_ordered.dt.t_value =
7144  0; /* thread 0 enters the ordered section first */
7145 
7146  /* Reset the identifiers on the dispatch buffer */
7147  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7148  if (team->t.t_max_nproc > 1) {
7149  int i;
7150  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7151  team->t.t_disp_buffer[i].buffer_index = i;
7152 #if OMP_45_ENABLED
7153  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7154 #endif
7155  }
7156  } else {
7157  team->t.t_disp_buffer[0].buffer_index = 0;
7158 #if OMP_45_ENABLED
7159  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7160 #endif
7161  }
7162 
7163  KMP_MB(); /* Flush all pending memory write invalidates. */
7164  KMP_ASSERT(this_thr->th.th_team == team);
7165 
7166 #ifdef KMP_DEBUG
7167  for (f = 0; f < team->t.t_nproc; f++) {
7168  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7169  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7170  }
7171 #endif /* KMP_DEBUG */
7172 
7173  /* release the worker threads so they may begin working */
7174  __kmp_fork_barrier(gtid, 0);
7175 }
7176 
7177 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7178  kmp_info_t *this_thr = __kmp_threads[gtid];
7179 
7180  KMP_DEBUG_ASSERT(team);
7181  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7182  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7183  KMP_MB(); /* Flush all pending memory write invalidates. */
7184 
7185 /* Join barrier after fork */
7186 
7187 #ifdef KMP_DEBUG
7188  if (__kmp_threads[gtid] &&
7189  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7190  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7191  __kmp_threads[gtid]);
7192  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7193  "team->t.t_nproc=%d\n",
7194  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7195  team->t.t_nproc);
7196  __kmp_print_structure();
7197  }
7198  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7199  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7200 #endif /* KMP_DEBUG */
7201 
7202  __kmp_join_barrier(gtid); /* wait for everyone */
7203 #if OMPT_SUPPORT
7204  if (ompt_enabled.enabled &&
7205  this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7206  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7207  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7208  this_thr->th.ompt_thread_info.state = omp_state_overhead;
7209 #if OMPT_OPTIONAL
7210  void *codeptr = NULL;
7211  if (KMP_MASTER_TID(ds_tid) &&
7212  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7213  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7214  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7215 
7216  if (ompt_enabled.ompt_callback_sync_region_wait) {
7217  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7218  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7219  }
7220  if (ompt_enabled.ompt_callback_sync_region) {
7221  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7222  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7223  }
7224 #endif
7225  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7226  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7227  ompt_scope_end, NULL, task_data, 0, ds_tid);
7228  }
7229  }
7230 #endif
7231 
7232  KMP_MB(); /* Flush all pending memory write invalidates. */
7233  KMP_ASSERT(this_thr->th.th_team == team);
7234 }
7235 
7236 /* ------------------------------------------------------------------------ */
7237 
7238 #ifdef USE_LOAD_BALANCE
7239 
7240 // Return the number of worker threads actively spinning in the hot team, if
7241 // we are at the outermost level of parallelism. Otherwise, return 0.
7242 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7243  int i;
7244  int retval;
7245  kmp_team_t *hot_team;
7246 
7247  if (root->r.r_active) {
7248  return 0;
7249  }
7250  hot_team = root->r.r_hot_team;
7251  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7252  return hot_team->t.t_nproc - 1; // Don't count master thread
7253  }
7254 
7255  // Skip the master thread - it is accounted for elsewhere.
7256  retval = 0;
7257  for (i = 1; i < hot_team->t.t_nproc; i++) {
7258  if (hot_team->t.t_threads[i]->th.th_active) {
7259  retval++;
7260  }
7261  }
7262  return retval;
7263 }
7264 
7265 // Perform an automatic adjustment to the number of
7266 // threads used by the next parallel region.
7267 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7268  int retval;
7269  int pool_active;
7270  int hot_team_active;
7271  int team_curr_active;
7272  int system_active;
7273 
7274  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7275  set_nproc));
7276  KMP_DEBUG_ASSERT(root);
7277  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7278  ->th.th_current_task->td_icvs.dynamic == TRUE);
7279  KMP_DEBUG_ASSERT(set_nproc > 1);
7280 
7281  if (set_nproc == 1) {
7282  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7283  return 1;
7284  }
7285 
7286  // Threads that are active in the thread pool, active in the hot team for this
7287  // particular root (if we are at the outer par level), and the currently
7288  // executing thread (to become the master) are available to add to the new
7289  // team, but are currently contributing to the system load, and must be
7290  // accounted for.
7291  pool_active = __kmp_thread_pool_active_nth;
7292  hot_team_active = __kmp_active_hot_team_nproc(root);
7293  team_curr_active = pool_active + hot_team_active + 1;
7294 
7295  // Check the system load.
7296  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7297  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7298  "hot team active = %d\n",
7299  system_active, pool_active, hot_team_active));
7300 
7301  if (system_active < 0) {
7302  // There was an error reading the necessary info from /proc, so use the
7303  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7304  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7305  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7306  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7307 
7308  // Make this call behave like the thread limit algorithm.
7309  retval = __kmp_avail_proc - __kmp_nth +
7310  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7311  if (retval > set_nproc) {
7312  retval = set_nproc;
7313  }
7314  if (retval < KMP_MIN_NTH) {
7315  retval = KMP_MIN_NTH;
7316  }
7317 
7318  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7319  retval));
7320  return retval;
7321  }
7322 
7323  // There is a slight delay in the load balance algorithm in detecting new
7324  // running procs. The real system load at this instant should be at least as
7325  // large as the #active omp threads that are available to add to the team.
7326  if (system_active < team_curr_active) {
7327  system_active = team_curr_active;
7328  }
7329  retval = __kmp_avail_proc - system_active + team_curr_active;
7330  if (retval > set_nproc) {
7331  retval = set_nproc;
7332  }
7333  if (retval < KMP_MIN_NTH) {
7334  retval = KMP_MIN_NTH;
7335  }
7336 
7337  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7338  return retval;
7339 } // __kmp_load_balance_nproc()
7340 
7341 #endif /* USE_LOAD_BALANCE */
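
// [Editor's sketch -- not part of kmp_runtime.cpp] The core of
// __kmp_load_balance_nproc above: threads we already account for (pool, hot
// team, and the forking thread itself -- team_curr_active) are added back to
// the headroom left by the measured system load, and the result is clamped to
// [KMP_MIN_NTH, set_nproc]. For example, avail_proc = 8, system_active = 10
// and team_curr_active = 5 yields 8 - 10 + 5 = 3 threads. A standalone
// illustration (demo_load_balance_nproc is a hypothetical name):
#include <stdio.h>

static int demo_load_balance_nproc(int avail_proc, int system_active,
                                   int team_curr_active, int set_nproc,
                                   int min_nth) {
  // The measured load should at least include our own available threads.
  if (system_active < team_curr_active)
    system_active = team_curr_active;
  int retval = avail_proc - system_active + team_curr_active;
  if (retval > set_nproc)
    retval = set_nproc;
  if (retval < min_nth)
    retval = min_nth;
  return retval;
}

int main(void) {
  printf("%d\n", demo_load_balance_nproc(8, 10, 5, 8, 1)); // prints 3
  return 0;
}
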
7342 
7343 /* ------------------------------------------------------------------------ */
7344 
7345 /* NOTE: this is called with the __kmp_init_lock held */
7346 void __kmp_cleanup(void) {
7347  int f;
7348 
7349  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7350 
7351  if (TCR_4(__kmp_init_parallel)) {
7352 #if KMP_HANDLE_SIGNALS
7353  __kmp_remove_signals();
7354 #endif
7355  TCW_4(__kmp_init_parallel, FALSE);
7356  }
7357 
7358  if (TCR_4(__kmp_init_middle)) {
7359 #if KMP_AFFINITY_SUPPORTED
7360  __kmp_affinity_uninitialize();
7361 #endif /* KMP_AFFINITY_SUPPORTED */
7362  __kmp_cleanup_hierarchy();
7363  TCW_4(__kmp_init_middle, FALSE);
7364  }
7365 
7366  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7367 
7368  if (__kmp_init_serial) {
7369  __kmp_runtime_destroy();
7370  __kmp_init_serial = FALSE;
7371  }
7372 
7373  __kmp_cleanup_threadprivate_caches();
7374 
7375  for (f = 0; f < __kmp_threads_capacity; f++) {
7376  if (__kmp_root[f] != NULL) {
7377  __kmp_free(__kmp_root[f]);
7378  __kmp_root[f] = NULL;
7379  }
7380  }
7381  __kmp_free(__kmp_threads);
7382  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7383  // there is no need to free __kmp_root separately.
7384  __kmp_threads = NULL;
7385  __kmp_root = NULL;
7386  __kmp_threads_capacity = 0;
7387 
7388 #if KMP_USE_DYNAMIC_LOCK
7389  __kmp_cleanup_indirect_user_locks();
7390 #else
7391  __kmp_cleanup_user_locks();
7392 #endif
7393 
7394 #if KMP_AFFINITY_SUPPORTED
7395  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7396  __kmp_cpuinfo_file = NULL;
7397 #endif /* KMP_AFFINITY_SUPPORTED */
7398 
7399 #if KMP_USE_ADAPTIVE_LOCKS
7400 #if KMP_DEBUG_ADAPTIVE_LOCKS
7401  __kmp_print_speculative_stats();
7402 #endif
7403 #endif
7404  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7405  __kmp_nested_nth.nth = NULL;
7406  __kmp_nested_nth.size = 0;
7407  __kmp_nested_nth.used = 0;
7408  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7409  __kmp_nested_proc_bind.bind_types = NULL;
7410  __kmp_nested_proc_bind.size = 0;
7411  __kmp_nested_proc_bind.used = 0;
7412 
7413  __kmp_i18n_catclose();
7414 
7415 #if KMP_USE_HIER_SCHED
7416  __kmp_hier_scheds.deallocate();
7417 #endif
7418 
7419 #if KMP_STATS_ENABLED
7420  __kmp_stats_fini();
7421 #endif
7422 
7423  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7424 }
7425 
7426 /* ------------------------------------------------------------------------ */
7427 
7428 int __kmp_ignore_mppbeg(void) {
7429  char *env;
7430 
7431  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7432  if (__kmp_str_match_false(env))
7433  return FALSE;
7434  }
7435  // By default __kmpc_begin() is no-op.
7436  return TRUE;
7437 }
7438 
7439 int __kmp_ignore_mppend(void) {
7440  char *env;
7441 
7442  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7443  if (__kmp_str_match_false(env))
7444  return FALSE;
7445  }
7446  // By default __kmpc_end() is no-op.
7447  return TRUE;
7448 }
7449 
7450 void __kmp_internal_begin(void) {
7451  int gtid;
7452  kmp_root_t *root;
7453 
7454  /* this is a very important step as it will register new sibling threads
7455  and assign these new uber threads a new gtid */
7456  gtid = __kmp_entry_gtid();
7457  root = __kmp_threads[gtid]->th.th_root;
7458  KMP_ASSERT(KMP_UBER_GTID(gtid));
7459 
7460  if (root->r.r_begin)
7461  return;
7462  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7463  if (root->r.r_begin) {
7464  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7465  return;
7466  }
7467 
7468  root->r.r_begin = TRUE;
7469 
7470  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7471 }
7472 
7473 /* ------------------------------------------------------------------------ */
7474 
7475 void __kmp_user_set_library(enum library_type arg) {
7476  int gtid;
7477  kmp_root_t *root;
7478  kmp_info_t *thread;
7479 
7480  /* first, make sure we are initialized so we can get our gtid */
7481 
7482  gtid = __kmp_entry_gtid();
7483  thread = __kmp_threads[gtid];
7484 
7485  root = thread->th.th_root;
7486 
7487  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7488  library_serial));
7489  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7490  thread */
7491  KMP_WARNING(SetLibraryIncorrectCall);
7492  return;
7493  }
7494 
7495  switch (arg) {
7496  case library_serial:
7497  thread->th.th_set_nproc = 0;
7498  set__nproc(thread, 1);
7499  break;
7500  case library_turnaround:
7501  thread->th.th_set_nproc = 0;
7502  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7503  : __kmp_dflt_team_nth_ub);
7504  break;
7505  case library_throughput:
7506  thread->th.th_set_nproc = 0;
7507  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7508  : __kmp_dflt_team_nth_ub);
7509  break;
7510  default:
7511  KMP_FATAL(UnknownLibraryType, arg);
7512  }
7513 
7514  __kmp_aux_set_library(arg);
7515 }
7516 
7517 void __kmp_aux_set_stacksize(size_t arg) {
7518  if (!__kmp_init_serial)
7519  __kmp_serial_initialize();
7520 
7521 #if KMP_OS_DARWIN
7522  if (arg & (0x1000 - 1)) {
7523  arg &= ~(0x1000 - 1);
7524  if (arg + 0x1000) /* check for overflow if we round up */
7525  arg += 0x1000;
7526  }
7527 #endif
7528  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7529 
7530  /* only change the default stacksize before the first parallel region */
7531  if (!TCR_4(__kmp_init_parallel)) {
7532  size_t value = arg; /* argument is in bytes */
7533 
7534  if (value < __kmp_sys_min_stksize)
7535  value = __kmp_sys_min_stksize;
7536  else if (value > KMP_MAX_STKSIZE)
7537  value = KMP_MAX_STKSIZE;
7538 
7539  __kmp_stksize = value;
7540 
7541  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7542  }
7543 
7544  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7545 }
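
// [Editor's sketch -- not part of kmp_runtime.cpp] The Darwin-only branch in
// __kmp_aux_set_stacksize above rounds a requested stack size up to the next
// multiple of the 4 KiB page size (0x1000): 0x1800 becomes 0x2000, while an
// already-aligned 0x2000 is left unchanged. A standalone illustration
// (demo_round_up_to_page is a hypothetical name):
#include <stddef.h>
#include <stdio.h>

static size_t demo_round_up_to_page(size_t arg) {
  if (arg & (0x1000 - 1)) {          // not page aligned?
    arg &= ~(size_t)(0x1000 - 1);    // drop the partial page
    if (arg + 0x1000 > arg)          // guard against wrap-around on overflow
      arg += 0x1000;                 // round up to the next page
  }
  return arg;
}

int main(void) {
  printf("0x%zx\n", demo_round_up_to_page(0x1800)); // prints 0x2000
  printf("0x%zx\n", demo_round_up_to_page(0x2000)); // prints 0x2000
  return 0;
}
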
7546 
7547 /* set the behaviour of the runtime library */
7548 /* TODO this can cause some odd behaviour with sibling parallelism... */
7549 void __kmp_aux_set_library(enum library_type arg) {
7550  __kmp_library = arg;
7551 
7552  switch (__kmp_library) {
7553  case library_serial: {
7554  KMP_INFORM(LibraryIsSerial);
7555  (void)__kmp_change_library(TRUE);
7556  } break;
7557  case library_turnaround:
7558  (void)__kmp_change_library(TRUE);
7559  break;
7560  case library_throughput:
7561  (void)__kmp_change_library(FALSE);
7562  break;
7563  default:
7564  KMP_FATAL(UnknownLibraryType, arg);
7565  }
7566 }
7567 
7568 /* ------------------------------------------------------------------------ */
7569 
7570 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7571  int blocktime = arg; /* argument is in milliseconds */
7572 #if KMP_USE_MONITOR
7573  int bt_intervals;
7574 #endif
7575  int bt_set;
7576 
7577  __kmp_save_internal_controls(thread);
7578 
7579  /* Normalize and set blocktime for the teams */
7580  if (blocktime < KMP_MIN_BLOCKTIME)
7581  blocktime = KMP_MIN_BLOCKTIME;
7582  else if (blocktime > KMP_MAX_BLOCKTIME)
7583  blocktime = KMP_MAX_BLOCKTIME;
7584 
7585  set__blocktime_team(thread->th.th_team, tid, blocktime);
7586  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7587 
7588 #if KMP_USE_MONITOR
7589  /* Calculate and set blocktime intervals for the teams */
7590  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7591 
7592  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7593  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7594 #endif
7595 
7596  /* Set whether blocktime has been set to "TRUE" */
7597  bt_set = TRUE;
7598 
7599  set__bt_set_team(thread->th.th_team, tid, bt_set);
7600  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7601 #if KMP_USE_MONITOR
7602  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7603  "bt_intervals=%d, monitor_updates=%d\n",
7604  __kmp_gtid_from_tid(tid, thread->th.th_team),
7605  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7606  __kmp_monitor_wakeups));
7607 #else
7608  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7609  __kmp_gtid_from_tid(tid, thread->th.th_team),
7610  thread->th.th_team->t.t_id, tid, blocktime));
7611 #endif
7612 }
7613 
7614 void __kmp_aux_set_defaults(char const *str, int len) {
7615  if (!__kmp_init_serial) {
7616  __kmp_serial_initialize();
7617  }
7618  __kmp_env_initialize(str);
7619 
7620  if (__kmp_settings
7621 #if OMP_40_ENABLED
7622  || __kmp_display_env || __kmp_display_env_verbose
7623 #endif // OMP_40_ENABLED
7624  ) {
7625  __kmp_env_print();
7626  }
7627 } // __kmp_aux_set_defaults
7628 
7629 /* ------------------------------------------------------------------------ */
7630 /* internal fast reduction routines */
7631 
7632 PACKED_REDUCTION_METHOD_T
7633 __kmp_determine_reduction_method(
7634  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7635  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7636  kmp_critical_name *lck) {
7637 
7638  // Default reduction method: critical construct ( lck != NULL, like in current
7639  // PAROPT )
7640  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7641  // can be selected by RTL
7642  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7643  // can be selected by RTL
7644  // Finally, it's up to OpenMP RTL to make a decision on which method to select
7645  // among those generated by PAROPT.
7646 
7647  PACKED_REDUCTION_METHOD_T retval;
7648 
7649  int team_size;
7650 
7651  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7652  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7653 
7654 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
7655  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7656 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7657 
7658  retval = critical_reduce_block;
7659 
7660  // another choice of getting the team size (with 1 dynamic dereference) is slower
7661  team_size = __kmp_get_team_num_threads(global_tid);
7662  if (team_size == 1) {
7663 
7664  retval = empty_reduce_block;
7665 
7666  } else {
7667 
7668  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7669 
7670 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7671 
7672 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || \
7673  KMP_OS_DARWIN
7674 
7675  int teamsize_cutoff = 4;
7676 
7677 #if KMP_MIC_SUPPORTED
7678  if (__kmp_mic_type != non_mic) {
7679  teamsize_cutoff = 8;
7680  }
7681 #endif
7682  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7683  if (tree_available) {
7684  if (team_size <= teamsize_cutoff) {
7685  if (atomic_available) {
7686  retval = atomic_reduce_block;
7687  }
7688  } else {
7689  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7690  }
7691  } else if (atomic_available) {
7692  retval = atomic_reduce_block;
7693  }
7694 #else
7695 #error "Unknown or unsupported OS"
7696 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7697 // KMP_OS_DARWIN
7698 
7699 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7700 
7701 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7702 
7703  // basic tuning
7704 
7705  if (atomic_available) {
7706  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7707  retval = atomic_reduce_block;
7708  }
7709  } // otherwise: use critical section
7710 
7711 #elif KMP_OS_DARWIN
7712 
7713  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7714  if (atomic_available && (num_vars <= 3)) {
7715  retval = atomic_reduce_block;
7716  } else if (tree_available) {
7717  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7718  (reduce_size < (2000 * sizeof(kmp_real64)))) {
7719  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7720  }
7721  } // otherwise: use critical section
7722 
7723 #else
7724 #error "Unknown or unsupported OS"
7725 #endif
7726 
7727 #else
7728 #error "Unknown or unsupported architecture"
7729 #endif
7730  }
7731 
7732  // KMP_FORCE_REDUCTION
7733 
7734  // If the team is serialized (team_size == 1), ignore the forced reduction
7735  // method and stay with the unsynchronized method (empty_reduce_block)
7736  if (__kmp_force_reduction_method != reduction_method_not_defined &&
7737  team_size != 1) {
7738 
7739  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7740 
7741  int atomic_available, tree_available;
7742 
7743  switch ((forced_retval = __kmp_force_reduction_method)) {
7744  case critical_reduce_block:
7745  KMP_ASSERT(lck); // lck should be != 0
7746  break;
7747 
7748  case atomic_reduce_block:
7749  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7750  if (!atomic_available) {
7751  KMP_WARNING(RedMethodNotSupported, "atomic");
7752  forced_retval = critical_reduce_block;
7753  }
7754  break;
7755 
7756  case tree_reduce_block:
7757  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7758  if (!tree_available) {
7759  KMP_WARNING(RedMethodNotSupported, "tree");
7760  forced_retval = critical_reduce_block;
7761  } else {
7762 #if KMP_FAST_REDUCTION_BARRIER
7763  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7764 #endif
7765  }
7766  break;
7767 
7768  default:
7769  KMP_ASSERT(0); // "unsupported method specified"
7770  }
7771 
7772  retval = forced_retval;
7773  }
7774 
7775  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7776 
7777 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7778 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7779 
7780  return (retval);
7781 }
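
// [Editor's sketch -- not part of kmp_runtime.cpp] A simplified,
// platform-neutral view of the selection above: the compiler advertises which
// fast paths it generated (an atomic variant via the KMP_IDENT_ATOMIC_REDUCE
// flag, a tree variant via non-NULL reduce_data/reduce_func), and the runtime
// then uses the unsynchronized method for serialized teams, atomics for small
// teams, a tree reduction over the reduction barrier for large teams, and the
// critical-section method as the always-correct fallback. This mirrors the
// 64-bit Linux branch above; the enum and cutoff here are illustrative only.
#include <stdio.h>

enum demo_reduce_method { DEMO_CRITICAL, DEMO_EMPTY, DEMO_ATOMIC, DEMO_TREE };

static enum demo_reduce_method
demo_pick_reduction(int team_size, int atomic_available, int tree_available,
                    int teamsize_cutoff) {
  if (team_size == 1)
    return DEMO_EMPTY; // nothing to synchronize in a serialized team
  if (tree_available) {
    if (team_size <= teamsize_cutoff)
      return atomic_available ? DEMO_ATOMIC : DEMO_CRITICAL;
    return DEMO_TREE; // large teams: reduce over a barrier tree
  }
  if (atomic_available)
    return DEMO_ATOMIC;
  return DEMO_CRITICAL; // always-correct fallback
}

int main(void) {
  // 8-thread team, both fast methods generated, cutoff 4 -> tree reduction.
  printf("%d\n", demo_pick_reduction(8, 1, 1, 4)); // prints 3 (DEMO_TREE)
  return 0;
}
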
7782 
7783 // this function is for testing set/get/determine reduce method
7784 kmp_int32 __kmp_get_reduce_method(void) {
7785  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7786 }
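
// [Editor's sketch -- not part of kmp_runtime.cpp] The ">> 8" above undoes the
// packing used for th_local.packed_reduction_method: the chosen reduction
// method sits in the upper bits and the associated barrier type in the low
// 8 bits, so shifting right by 8 recovers the method id. The constant values
// below are illustrative, not the runtime's actual enum values.
#include <stdio.h>

int main(void) {
  unsigned method = 3;       // e.g. a tree-reduction method id
  unsigned barrier_type = 1; // e.g. the reduction-barrier id
  unsigned packed = (method << 8) | barrier_type;
  printf("packed=0x%x method=%u barrier=%u\n", packed, packed >> 8,
         packed & 0xffu);
  return 0;
}
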