1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 #include "kmp_dispatch.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 
36 /* these are temporary issues to be dealt with */
37 #define KMP_USE_PRCTL 0
38 
39 #if KMP_OS_WINDOWS
40 #include <process.h>
41 #endif
42 
43 #include "tsan_annotations.h"
44 
45 #if defined(KMP_GOMP_COMPAT)
46 char const __kmp_version_alt_comp[] =
47  KMP_VERSION_PREFIX "alternative compiler support: yes";
48 #endif /* defined(KMP_GOMP_COMPAT) */
49 
50 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
51 #if OMP_50_ENABLED
52  "5.0 (201611)";
53 #elif OMP_45_ENABLED
54  "4.5 (201511)";
55 #elif OMP_40_ENABLED
56  "4.0 (201307)";
57 #else
58  "3.1 (201107)";
59 #endif
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63  KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79  int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81  kmp_internal_control_t *new_icvs,
82  ident_t *loc);
83 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85  int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91  kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* fast (and somewhat portable) way to get unique identifier of executing
107  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109  int i;
110  kmp_info_t **other_threads;
111  size_t stack_data;
112  char *stack_addr;
113  size_t stack_size;
114  char *stack_base;
115 
116  KA_TRACE(
117  1000,
118  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
119  __kmp_nth, __kmp_all_nth));
120 
121  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
122  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
123  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
124  __kmp_init_gtid for this to work. */
125 
126  if (!TCR_4(__kmp_init_gtid))
127  return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130  if (TCR_4(__kmp_gtid_mode) >= 3) {
131  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132  return __kmp_gtid;
133  }
134 #endif
135  if (TCR_4(__kmp_gtid_mode) >= 2) {
136  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137  return __kmp_gtid_get_specific();
138  }
139  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141  stack_addr = (char *)&stack_data;
142  other_threads = __kmp_threads;
143 
144  /* ATT: The code below is a source of potential bugs due to unsynchronized
145  access to __kmp_threads array. For example:
146  1. Current thread loads other_threads[i] to thr and checks it, it is
147  non-NULL.
148  2. Current thread is suspended by OS.
149  3. Another thread unregisters and finishes (debug versions of free()
150  may fill memory with something like 0xEF).
151  4. Current thread is resumed.
152  5. Current thread reads junk from *thr.
153  TODO: Fix it. --ln */
154 
155  for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158  if (!thr)
159  continue;
160 
161  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164  /* stack grows down -- search through all of the active threads */
165 
166  if (stack_addr <= stack_base) {
167  size_t stack_diff = stack_base - stack_addr;
168 
169  if (stack_diff <= stack_size) {
170  /* The only way we can be closer than the allocated */
171  /* stack size is if we are running on this thread. */
172  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173  return i;
174  }
175  }
176  }
177 
178  /* get specific to try and determine our gtid */
179  KA_TRACE(1000,
180  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181  "thread, using TLS\n"));
182  i = __kmp_gtid_get_specific();
183 
184  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
185 
186  /* if we haven't been assigned a gtid, then return the error code */
187  if (i < 0)
188  return i;
189 
190  /* dynamically updated stack window for uber threads to avoid get_specific
191  call */
192  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193  KMP_FATAL(StackOverflow, i);
194  }
195 
196  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197  if (stack_addr > stack_base) {
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201  stack_base);
202  } else {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  stack_base - stack_addr);
205  }
206 
207  /* Reprint stack bounds for ubermaster since they have been refined */
208  if (__kmp_storage_map) {
209  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212  other_threads[i]->th.th_info.ds.ds_stacksize,
213  "th_%d stack (refinement)", i);
214  }
215  return i;
216 }
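
/* A minimal sketch of the lookup above (for exposition only; variable names are
   illustrative): each thread's stack is recorded as the window
   [ds_stackbase - ds_stacksize, ds_stackbase] and grows downward, so the
   address of any automatic variable identifies the owning thread:

     char local;                                 // lives on the current stack
     char *addr = (char *)&local;
     int on_stack_of_thread_i =
         addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;

   For uber (root) threads the recorded window is only an estimate, which is
   why the code above instead widens ds_stackbase/ds_stacksize when the
   current address falls outside it. */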
217 
218 int __kmp_get_global_thread_id_reg() {
219  int gtid;
220 
221  if (!__kmp_init_serial) {
222  gtid = KMP_GTID_DNE;
223  } else
224 #ifdef KMP_TDATA_GTID
225  if (TCR_4(__kmp_gtid_mode) >= 3) {
226  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227  gtid = __kmp_gtid;
228  } else
229 #endif
230  if (TCR_4(__kmp_gtid_mode) >= 2) {
231  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232  gtid = __kmp_gtid_get_specific();
233  } else {
234  KA_TRACE(1000,
235  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236  gtid = __kmp_get_global_thread_id();
237  }
238 
239  /* we must be a new uber master sibling thread */
240  if (gtid == KMP_GTID_DNE) {
241  KA_TRACE(10,
242  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243  "Registering a new gtid.\n"));
244  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245  if (!__kmp_init_serial) {
246  __kmp_do_serial_initialize();
247  gtid = __kmp_gtid_get_specific();
248  } else {
249  gtid = __kmp_register_root(FALSE);
250  }
251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253  }
254 
255  KMP_DEBUG_ASSERT(gtid >= 0);
256 
257  return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262  int f;
263  char *stack_beg = NULL;
264  char *stack_end = NULL;
265  int gtid;
266 
267  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268  if (__kmp_storage_map) {
269  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272  gtid = __kmp_gtid_from_thread(th);
273 
274  if (gtid == KMP_GTID_MONITOR) {
275  __kmp_print_storage_map_gtid(
276  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277  "th_%s stack (%s)", "mon",
278  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279  } else {
280  __kmp_print_storage_map_gtid(
281  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282  "th_%d stack (%s)", gtid,
283  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284  }
285  }
286 
287  /* No point in checking ubermaster threads since they use refinement and
288  * cannot overlap */
289  gtid = __kmp_gtid_from_thread(th);
290  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291  KA_TRACE(10,
292  ("__kmp_check_stack_overlap: performing extensive checking\n"));
293  if (stack_beg == NULL) {
294  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296  }
297 
298  for (f = 0; f < __kmp_threads_capacity; f++) {
299  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301  if (f_th && f_th != th) {
302  char *other_stack_end =
303  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304  char *other_stack_beg =
305  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309  /* Print the other stack values before the abort */
310  if (__kmp_storage_map)
311  __kmp_print_storage_map_gtid(
312  -1, other_stack_beg, other_stack_end,
313  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317  __kmp_msg_null);
318  }
319  }
320  }
321  }
322  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
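
/* A minimal sketch of the conflict test above (for exposition only): each
   stack is treated as the interval [stack_beg, stack_end) with
   stack_beg = stack_end - stack_size, and two stacks are reported as
   overlapping when either endpoint of this thread's interval falls strictly
   inside another thread's interval:

     int overlap =
         (stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
         (stack_end > other_stack_beg && stack_end < other_stack_end);
*/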
324 
325 /* ------------------------------------------------------------------------ */
326 
327 void __kmp_infinite_loop(void) {
328  static int done = FALSE;
329 
330  while (!done) {
331  KMP_YIELD(1);
332  }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338  char const *format, ...) {
339  char buffer[MAX_MESSAGE];
340  va_list ap;
341 
342  va_start(ap, format);
343  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344  p2, (unsigned long)size, format);
345  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346  __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348  int node;
349  if (gtid >= 0) {
350  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351  if (__kmp_storage_map_verbose) {
352  node = __kmp_get_host_node(p1);
353  if (node < 0) /* doesn't work, so don't try this next time */
354  __kmp_storage_map_verbose = FALSE;
355  else {
356  char *last;
357  int lastNode;
358  int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360  const int page_size = KMP_GET_PAGE_SIZE();
361 
362  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364  if (localProc >= 0)
365  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
366  localProc >> 1);
367  else
368  __kmp_printf_no_lock(" GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370  /* The more elaborate format is disabled for now because of the prctl
371  * hanging bug. */
372  do {
373  last = p1;
374  lastNode = node;
375  /* This loop collates adjacent pages with the same host node. */
376  do {
377  p1 = (char *)p1 + page_size;
378  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
380  lastNode);
381  } while (p1 <= p2);
382 #else
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
384  (char *)p1 + (page_size - 1),
385  __kmp_get_host_node(p1));
386  if (p1 < p2) {
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
388  (char *)p2 + (page_size - 1),
389  __kmp_get_host_node(p2));
390  }
391 #endif
392  }
393  }
394  } else
395  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
396  }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
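
/* For exposition only (addresses and size below are made up): because the
   caller's format string is appended to the "OMP storage map:" prefix before
   the single __kmp_vprintf() call, an emitted line looks roughly like

     OMP storage map: 0x2b5e40000000 0x2b5e40100000  1048576 th_0 stack (initial)

   The optional data-placement block that follows is printed only when
   KMP_PRINT_DATA_PLACEMENT is compiled in and __kmp_storage_map_verbose is
   set. */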
400 
401 void __kmp_warn(char const *format, ...) {
402  char buffer[MAX_MESSAGE];
403  va_list ap;
404 
405  if (__kmp_generate_warnings == kmp_warnings_off) {
406  return;
407  }
408 
409  va_start(ap, format);
410 
411  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413  __kmp_vprintf(kmp_err, buffer, ap);
414  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416  va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420  // Later threads may stall here, but that's ok because abort() will kill them.
421  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423  if (__kmp_debug_buf) {
424  __kmp_dump_debug_buffer();
425  }
426 
427  if (KMP_OS_WINDOWS) {
428  // Let other threads know of abnormal termination and prevent deadlock
429  // if abort happened during library initialization or shutdown
430  __kmp_global.g.g_abort = SIGABRT;
431 
432  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
433  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
434  boxes. _set_abort_behavior() works well, but this function is not
435  available in VS7 (this is not a problem for the DLL, but it is a problem
436  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
437  does not help, at least in some versions of the MS C RTL.
438 
439  It seems the following sequence is the only way to simulate abort() and
440  avoid the pop-up error box. */
441  raise(SIGABRT);
442  _exit(3); // Just in case, if signal ignored, exit anyway.
443  } else {
444  abort();
445  }
446 
447  __kmp_infinite_loop();
448  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453  // TODO: Eliminate g_abort global variable and this function.
454  // In case of abort just call abort(), it will kill all the threads.
455  __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459  that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463  gtid);
464 
465  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469  sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471  __kmp_print_storage_map_gtid(
472  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476  &thr->th.th_bar[bs_plain_barrier + 1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478  gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481  &thr->th.th_bar[bs_forkjoin_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483  gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487  &thr->th.th_bar[bs_reduction_barrier + 1],
488  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489  gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494  that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497  int team_id, int num_thr) {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500  header, team_id);
501 
502  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503  &team->t.t_bar[bs_last_barrier],
504  sizeof(kmp_balign_team_t) * bs_last_barrier,
505  "%s_%d.t_bar", header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508  &team->t.t_bar[bs_plain_barrier + 1],
509  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513  &team->t.t_bar[bs_forkjoin_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519  &team->t.t_bar[bs_reduction_barrier + 1],
520  sizeof(kmp_balign_team_t),
521  "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524  __kmp_print_storage_map_gtid(
525  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528  __kmp_print_storage_map_gtid(
529  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533  &team->t.t_disp_buffer[num_disp_buff],
534  sizeof(dispatch_shared_info_t) * num_disp_buff,
535  "%s_%d.t_disp_buffer", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
538  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
539  team_id);
540 }
541 
542 static void __kmp_init_allocator() {}
543 static void __kmp_fini_allocator() {}
544 
545 /* ------------------------------------------------------------------------ */
546 
547 #ifdef KMP_DYNAMIC_LIB
548 #if KMP_OS_WINDOWS
549 
550 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
551  // TODO: Change to __kmp_break_bootstrap_lock().
552  __kmp_init_bootstrap_lock(lck); // make the lock released
553 }
554 
555 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
556  int i;
557  int thread_count;
558 
559  // PROCESS_DETACH is expected to be called by a thread that executes
560  // ProcessExit() or FreeLibrary(). The OS terminates other threads (except the
561  // one calling ProcessExit or FreeLibrary), so it might seem safe to access
562  // __kmp_threads[] without taking the forkjoin_lock. However, some threads may
563  // in fact still be alive here, although they are about to be terminated. The
564  // threads in the array with ds_thread==0 are the most suspicious. So it may
565  // actually be unsafe to access __kmp_threads[].
566 
567  // TODO: does it make sense to check __kmp_roots[] ?
568 
569  // Let's check that there are no other alive threads registered with the OMP
570  // lib.
571  while (1) {
572  thread_count = 0;
573  for (i = 0; i < __kmp_threads_capacity; ++i) {
574  if (!__kmp_threads)
575  continue;
576  kmp_info_t *th = __kmp_threads[i];
577  if (th == NULL)
578  continue;
579  int gtid = th->th.th_info.ds.ds_gtid;
580  if (gtid == gtid_req)
581  continue;
582  if (gtid < 0)
583  continue;
584  DWORD exit_val;
585  int alive = __kmp_is_thread_alive(th, &exit_val);
586  if (alive) {
587  ++thread_count;
588  }
589  }
590  if (thread_count == 0)
591  break; // success
592  }
593 
594  // Assume that I'm alone. Now it might be safe to check and reset locks.
595  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
596  __kmp_reset_lock(&__kmp_forkjoin_lock);
597 #ifdef KMP_DEBUG
598  __kmp_reset_lock(&__kmp_stdio_lock);
599 #endif // KMP_DEBUG
600 }
601 
602 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
603  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
604 
605  switch (fdwReason) {
606 
607  case DLL_PROCESS_ATTACH:
608  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
609 
610  return TRUE;
611 
612  case DLL_PROCESS_DETACH:
613  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
614 
615  if (lpReserved != NULL) {
616  // lpReserved is used for telling the difference:
617  // lpReserved == NULL when FreeLibrary() was called,
618  // lpReserved != NULL when the process terminates.
619  // When FreeLibrary() is called, worker threads remain alive, so they will
620  // release the forkjoin lock by themselves. When the process terminates,
621  // worker threads disappear, triggering the problem of an unreleased forkjoin
622  // lock as described below.
623 
624  // A worker thread can take the forkjoin lock. The problem comes up if
625  // that worker thread becomes dead before it releases the forkjoin lock.
626  // The forkjoin lock remains taken, while the thread executing
627  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
628  // to take the forkjoin lock and will always fail, so that the application
629  // will never finish [normally]. This scenario is possible if
630  // __kmpc_end() has not been executed. It looks like this is not a corner
631  // case; common causes are:
632  // - the main function was compiled by an alternative compiler;
633  // - the main function was compiled by icl but without /Qopenmp
634  // (application with plugins);
635  // - application terminates by calling C exit(), Fortran CALL EXIT() or
636  // Fortran STOP.
637  // - alive foreign thread prevented __kmpc_end from doing cleanup.
638  //
639  // This is a hack to work around the problem.
640  // TODO: !!! figure out something better.
641  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
642  }
643 
644  __kmp_internal_end_library(__kmp_gtid_get_specific());
645 
646  return TRUE;
647 
648  case DLL_THREAD_ATTACH:
649  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
650 
651  /* if we want to register new siblings all the time, call
652  * __kmp_get_gtid() here */
653  return TRUE;
654 
655  case DLL_THREAD_DETACH:
656  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
657 
658  __kmp_internal_end_thread(__kmp_gtid_get_specific());
659  return TRUE;
660  }
661 
662  return TRUE;
663 }
664 
665 #endif /* KMP_OS_WINDOWS */
666 #endif /* KMP_DYNAMIC_LIB */
667 
668 /* Change the library type to "status" and return the old type */
669 /* called from within initialization routines where __kmp_initz_lock is held */
670 int __kmp_change_library(int status) {
671  int old_status;
672 
673  old_status = __kmp_yield_init &
674  1; // check whether KMP_LIBRARY=throughput (even init count)
675 
676  if (status) {
677  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
678  } else {
679  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
680  }
681 
682  return old_status; // return previous setting of whether
683  // KMP_LIBRARY=throughput
684 }
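
/* A minimal sketch of the encoding used above (for exposition only): the
   library type lives in bit 0 of __kmp_yield_init -- an even count means
   KMP_LIBRARY=throughput, an odd count means turnaround -- so the function is
   a read-modify-write of that single bit:

     int old = __kmp_yield_init & 1;  // previous mode bit
     if (status)
       __kmp_yield_init |= 1;         // throughput => turnaround
     else
       __kmp_yield_init &= ~1;        // turnaround => throughput
     return old;
*/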
685 
686 /* __kmp_parallel_deo -- Wait until it's our turn. */
687 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
688  int gtid = *gtid_ref;
689 #ifdef BUILD_PARALLEL_ORDERED
690  kmp_team_t *team = __kmp_team_from_gtid(gtid);
691 #endif /* BUILD_PARALLEL_ORDERED */
692 
693  if (__kmp_env_consistency_check) {
694  if (__kmp_threads[gtid]->th.th_root->r.r_active)
695 #if KMP_USE_DYNAMIC_LOCK
696  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
697 #else
698  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
699 #endif
700  }
701 #ifdef BUILD_PARALLEL_ORDERED
702  if (!team->t.t_serialized) {
703  KMP_MB();
704  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
705  KMP_EQ, NULL);
706  KMP_MB();
707  }
708 #endif /* BUILD_PARALLEL_ORDERED */
709 }
710 
711 /* __kmp_parallel_dxo -- Signal the next task. */
712 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
713  int gtid = *gtid_ref;
714 #ifdef BUILD_PARALLEL_ORDERED
715  int tid = __kmp_tid_from_gtid(gtid);
716  kmp_team_t *team = __kmp_team_from_gtid(gtid);
717 #endif /* BUILD_PARALLEL_ORDERED */
718 
719  if (__kmp_env_consistency_check) {
720  if (__kmp_threads[gtid]->th.th_root->r.r_active)
721  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
722  }
723 #ifdef BUILD_PARALLEL_ORDERED
724  if (!team->t.t_serialized) {
725  KMP_MB(); /* Flush all pending memory write invalidates. */
726 
727  /* use the tid of the next thread in this team */
728  /* TODO replace with general release procedure */
729  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
730 
731  KMP_MB(); /* Flush all pending memory write invalidates. */
732  }
733 #endif /* BUILD_PARALLEL_ORDERED */
734 }
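
/* Taken together (sketch for exposition only), __kmp_parallel_deo() and
   __kmp_parallel_dxo() implement ordered sections as a baton pass on
   team->t.t_ordered.dt.t_value: thread tid spins in deo until the counter
   equals its own tid, executes the ordered region, then dxo releases its
   successor:

     deo: KMP_WAIT_YIELD(&t_value, tid, KMP_EQ, NULL);  // wait for my turn
     dxo: t_value = (tid + 1) % team->t.t_nproc;        // hand off the baton
*/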
735 
736 /* ------------------------------------------------------------------------ */
737 /* The BARRIER for a SINGLE process section is always explicit */
738 
739 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
740  int status;
741  kmp_info_t *th;
742  kmp_team_t *team;
743 
744  if (!TCR_4(__kmp_init_parallel))
745  __kmp_parallel_initialize();
746 
747  th = __kmp_threads[gtid];
748  team = th->th.th_team;
749  status = 0;
750 
751  th->th.th_ident = id_ref;
752 
753  if (team->t.t_serialized) {
754  status = 1;
755  } else {
756  kmp_int32 old_this = th->th.th_local.this_construct;
757 
758  ++th->th.th_local.this_construct;
759  /* try to set team count to thread count--success means thread got the
760  single block */
761  /* TODO: Should this be acquire or release? */
762  if (team->t.t_construct == old_this) {
763  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
764  th->th.th_local.this_construct);
765  }
766 #if USE_ITT_BUILD
767  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
768  KMP_MASTER_GTID(gtid) &&
769 #if OMP_40_ENABLED
770  th->th.th_teams_microtask == NULL &&
771 #endif
772  team->t.t_active_level ==
773  1) { // Only report metadata by master of active team at level 1
774  __kmp_itt_metadata_single(id_ref);
775  }
776 #endif /* USE_ITT_BUILD */
777  }
778 
779  if (__kmp_env_consistency_check) {
780  if (status && push_ws) {
781  __kmp_push_workshare(gtid, ct_psingle, id_ref);
782  } else {
783  __kmp_check_workshare(gtid, ct_psingle, id_ref);
784  }
785  }
786 #if USE_ITT_BUILD
787  if (status) {
788  __kmp_itt_single_start(gtid);
789  }
790 #endif /* USE_ITT_BUILD */
791  return status;
792 }
793 
794 void __kmp_exit_single(int gtid) {
795 #if USE_ITT_BUILD
796  __kmp_itt_single_end(gtid);
797 #endif /* USE_ITT_BUILD */
798  if (__kmp_env_consistency_check)
799  __kmp_pop_workshare(gtid, ct_psingle, NULL);
800 }
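
/* A minimal sketch of how the SINGLE winner is chosen in __kmp_enter_single()
   above (for exposition only): every thread advances its private
   this_construct counter, and only the thread that manages to advance the
   shared team counter from the old value executes the block:

     kmp_int32 old_this = th->th.th_local.this_construct++;
     if (team->t.t_construct == old_this)  // cheap pre-check before the atomic
       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                               old_this + 1);
*/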
801 
802 /* determine if we can go parallel or must use a serialized parallel region and
803  * how many threads we can use
804  * set_nthreads is the number of threads requested for the team
805  * returns 1 if we should serialize or only use one thread,
806  * otherwise the number of threads to use
807  * The forkjoin lock is held by the caller. */
808 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
809  int master_tid, int set_nthreads
810 #if OMP_40_ENABLED
811  ,
812  int enter_teams
813 #endif /* OMP_40_ENABLED */
814  ) {
815  int capacity;
816  int new_nthreads;
817  KMP_DEBUG_ASSERT(__kmp_init_serial);
818  KMP_DEBUG_ASSERT(root && parent_team);
819 
820  // If dyn-var is set, dynamically adjust the number of desired threads,
821  // according to the method specified by dynamic_mode.
822  new_nthreads = set_nthreads;
823  if (!get__dynamic_2(parent_team, master_tid)) {
824  ;
825  }
826 #ifdef USE_LOAD_BALANCE
827  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
828  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
829  if (new_nthreads == 1) {
830  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831  "reservation to 1 thread\n",
832  master_tid));
833  return 1;
834  }
835  if (new_nthreads < set_nthreads) {
836  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
837  "reservation to %d threads\n",
838  master_tid, new_nthreads));
839  }
840  }
841 #endif /* USE_LOAD_BALANCE */
842  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
843  new_nthreads = __kmp_avail_proc - __kmp_nth +
844  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
845  if (new_nthreads <= 1) {
846  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847  "reservation to 1 thread\n",
848  master_tid));
849  return 1;
850  }
851  if (new_nthreads < set_nthreads) {
852  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
853  "reservation to %d threads\n",
854  master_tid, new_nthreads));
855  } else {
856  new_nthreads = set_nthreads;
857  }
858  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
859  if (set_nthreads > 2) {
860  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
861  new_nthreads = (new_nthreads % set_nthreads) + 1;
862  if (new_nthreads == 1) {
863  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864  "reservation to 1 thread\n",
865  master_tid));
866  return 1;
867  }
868  if (new_nthreads < set_nthreads) {
869  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
870  "reservation to %d threads\n",
871  master_tid, new_nthreads));
872  }
873  }
874  } else {
875  KMP_ASSERT(0);
876  }
877 
878  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
879  if (__kmp_nth + new_nthreads -
880  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
881  __kmp_max_nth) {
882  int tl_nthreads = __kmp_max_nth - __kmp_nth +
883  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
884  if (tl_nthreads <= 0) {
885  tl_nthreads = 1;
886  }
887 
888  // If dyn-var is false, emit a 1-time warning.
889  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
890  __kmp_reserve_warn = 1;
891  __kmp_msg(kmp_ms_warning,
892  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
893  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
894  }
895  if (tl_nthreads == 1) {
896  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
897  "reduced reservation to 1 thread\n",
898  master_tid));
899  return 1;
900  }
901  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
902  "reservation to %d threads\n",
903  master_tid, tl_nthreads));
904  new_nthreads = tl_nthreads;
905  }
906 
907  // Respect OMP_THREAD_LIMIT
908  if (root->r.r_cg_nthreads + new_nthreads -
909  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
910  __kmp_cg_max_nth) {
911  int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
912  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
913  if (tl_nthreads <= 0) {
914  tl_nthreads = 1;
915  }
916 
917  // If dyn-var is false, emit a 1-time warning.
918  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
919  __kmp_reserve_warn = 1;
920  __kmp_msg(kmp_ms_warning,
921  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
922  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
923  }
924  if (tl_nthreads == 1) {
925  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
926  "reduced reservation to 1 thread\n",
927  master_tid));
928  return 1;
929  }
930  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
931  "reservation to %d threads\n",
932  master_tid, tl_nthreads));
933  new_nthreads = tl_nthreads;
934  }
935 
936  // Check if the threads array is large enough, or needs expanding.
937  // See comment in __kmp_register_root() about the adjustment if
938  // __kmp_threads[0] == NULL.
939  capacity = __kmp_threads_capacity;
940  if (TCR_PTR(__kmp_threads[0]) == NULL) {
941  --capacity;
942  }
943  if (__kmp_nth + new_nthreads -
944  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
945  capacity) {
946  // Expand the threads array.
947  int slotsRequired = __kmp_nth + new_nthreads -
948  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
949  capacity;
950  int slotsAdded = __kmp_expand_threads(slotsRequired);
951  if (slotsAdded < slotsRequired) {
952  // The threads array was not expanded enough.
953  new_nthreads -= (slotsRequired - slotsAdded);
954  KMP_ASSERT(new_nthreads >= 1);
955 
956  // If dyn-var is false, emit a 1-time warning.
957  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
958  __kmp_reserve_warn = 1;
959  if (__kmp_tp_cached) {
960  __kmp_msg(kmp_ms_warning,
961  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
962  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
963  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
964  } else {
965  __kmp_msg(kmp_ms_warning,
966  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
967  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
968  }
969  }
970  }
971  }
972 
973 #ifdef KMP_DEBUG
974  if (new_nthreads == 1) {
975  KC_TRACE(10,
976  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
977  "dead roots and rechecking; requested %d threads\n",
978  __kmp_get_gtid(), set_nthreads));
979  } else {
980  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
981  " %d threads\n",
982  __kmp_get_gtid(), new_nthreads, set_nthreads));
983  }
984 #endif // KMP_DEBUG
985  return new_nthreads;
986 }
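
/* Worked example for the clamping above (numbers are made up; assumes only the
   master thread is running, so __kmp_nth == 1, r_cg_nthreads == 1 and the hot
   team holds one thread): with set_nthreads = 16, __kmp_max_nth = 12,
   __kmp_cg_max_nth = 8 and dyn-var false, the reservation is reduced
   16 -> 12 (KMP_DEVICE_THREAD_LIMIT) -> 8 (OMP_THREAD_LIMIT); the one-time
   CantFormThrTeam warning fires at the first clamp only, and 8 is returned
   provided the __kmp_threads[] array can be expanded to hold that many
   slots. */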
987 
988 /* Allocate threads from the thread pool and assign them to the new team. We are
989  assured that there are enough threads available, because we checked on that
990  earlier within the forkjoin critical section */
991 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
992  kmp_info_t *master_th, int master_gtid) {
993  int i;
994  int use_hot_team;
995 
996  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
997  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
998  KMP_MB();
999 
1000  /* first, let's setup the master thread */
1001  master_th->th.th_info.ds.ds_tid = 0;
1002  master_th->th.th_team = team;
1003  master_th->th.th_team_nproc = team->t.t_nproc;
1004  master_th->th.th_team_master = master_th;
1005  master_th->th.th_team_serialized = FALSE;
1006  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1007 
1008 /* make sure we are not the optimized hot team */
1009 #if KMP_NESTED_HOT_TEAMS
1010  use_hot_team = 0;
1011  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1012  if (hot_teams) { // hot teams array is not allocated if
1013  // KMP_HOT_TEAMS_MAX_LEVEL=0
1014  int level = team->t.t_active_level - 1; // index in array of hot teams
1015  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1016  if (master_th->th.th_teams_size.nteams > 1) {
1017  ++level; // level was not increased in teams construct for
1018  // team_of_masters
1019  }
1020  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1021  master_th->th.th_teams_level == team->t.t_level) {
1022  ++level; // level was not increased in teams construct for
1023  // team_of_workers before the parallel
1024  } // team->t.t_level will be increased inside parallel
1025  }
1026  if (level < __kmp_hot_teams_max_level) {
1027  if (hot_teams[level].hot_team) {
1028  // hot team has already been allocated for given level
1029  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1030  use_hot_team = 1; // the team is ready to use
1031  } else {
1032  use_hot_team = 0; // AC: threads are not allocated yet
1033  hot_teams[level].hot_team = team; // remember new hot team
1034  hot_teams[level].hot_team_nth = team->t.t_nproc;
1035  }
1036  } else {
1037  use_hot_team = 0;
1038  }
1039  }
1040 #else
1041  use_hot_team = team == root->r.r_hot_team;
1042 #endif
1043  if (!use_hot_team) {
1044 
1045  /* install the master thread */
1046  team->t.t_threads[0] = master_th;
1047  __kmp_initialize_info(master_th, team, 0, master_gtid);
1048 
1049  /* now, install the worker threads */
1050  for (i = 1; i < team->t.t_nproc; i++) {
1051 
1052  /* fork or reallocate a new thread and install it in team */
1053  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1054  team->t.t_threads[i] = thr;
1055  KMP_DEBUG_ASSERT(thr);
1056  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1057  /* align team and thread arrived states */
1058  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1059  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1060  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1061  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1062  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1063  team->t.t_bar[bs_plain_barrier].b_arrived));
1064 #if OMP_40_ENABLED
1065  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1066  thr->th.th_teams_level = master_th->th.th_teams_level;
1067  thr->th.th_teams_size = master_th->th.th_teams_size;
1068 #endif
1069  { // Initialize threads' barrier data.
1070  int b;
1071  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1072  for (b = 0; b < bs_last_barrier; ++b) {
1073  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1074  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1075 #if USE_DEBUGGER
1076  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1077 #endif
1078  }
1079  }
1080  }
1081 
1082 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1083  __kmp_partition_places(team);
1084 #endif
1085  }
1086 
1087  KMP_MB();
1088 }
1089 
1090 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1091 // Propagate any changes to the floating point control registers out to the team
1092 // We try to avoid unnecessary writes to the relevant cache line in the team
1093 // structure, so we don't make changes unless they are needed.
1094 inline static void propagateFPControl(kmp_team_t *team) {
1095  if (__kmp_inherit_fp_control) {
1096  kmp_int16 x87_fpu_control_word;
1097  kmp_uint32 mxcsr;
1098 
1099  // Get master values of FPU control flags (both X87 and vector)
1100  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1101  __kmp_store_mxcsr(&mxcsr);
1102  mxcsr &= KMP_X86_MXCSR_MASK;
1103 
1104  // There is no point looking at t_fp_control_saved here.
1105  // If it is TRUE, we still have to update the values if they are different
1106  // from those we now have. If it is FALSE we didn't save anything yet, but
1107  // our objective is the same. We have to ensure that the values in the team
1108  // are the same as those we have.
1109  // So, this code achieves what we need whether or not t_fp_control_saved is
1110  // true. By checking whether the value needs updating we avoid unnecessary
1111  // writes that would put the cache-line into a written state, causing all
1112  // threads in the team to have to read it again.
1113  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1114  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1115  // Although we don't use this value, other code in the runtime wants to know
1116  // whether it should restore them. So we must ensure it is correct.
1117  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1118  } else {
1119  // Similarly here. Don't write to this cache-line in the team structure
1120  // unless we have to.
1121  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1122  }
1123 }
1124 
1125 // Do the opposite, setting the hardware registers to the updated values from
1126 // the team.
1127 inline static void updateHWFPControl(kmp_team_t *team) {
1128  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1129  // Only reset the fp control regs if they have been changed in
1130  // the parallel region that we are exiting.
1131  kmp_int16 x87_fpu_control_word;
1132  kmp_uint32 mxcsr;
1133  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1134  __kmp_store_mxcsr(&mxcsr);
1135  mxcsr &= KMP_X86_MXCSR_MASK;
1136 
1137  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1138  __kmp_clear_x87_fpu_status_word();
1139  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1140  }
1141 
1142  if (team->t.t_mxcsr != mxcsr) {
1143  __kmp_load_mxcsr(&team->t.t_mxcsr);
1144  }
1145  }
1146 }
1147 #else
1148 #define propagateFPControl(x) ((void)0)
1149 #define updateHWFPControl(x) ((void)0)
1150 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
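
/* Intended pairing of the two helpers above (sketch for exposition only): the
   master publishes its FP state into the team when a parallel region forks,
   and the hardware registers are restored from the team when the region is
   left, but only if they have drifted:

     propagateFPControl(team);  // fork: team caches master's x87 CW and MXCSR
     ...                        // parallel region executes
     updateHWFPControl(team);   // exit: reload saved values if they changed
*/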
1151 
1152 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1153  int realloc); // forward declaration
1154 
1155 /* Run a parallel region that has been serialized, so runs only in a team of the
1156  single master thread. */
1157 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1158  kmp_info_t *this_thr;
1159  kmp_team_t *serial_team;
1160 
1161  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1162 
1163  /* Skip all this code for autopar serialized loops since it results in
1164  unacceptable overhead */
1165  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1166  return;
1167 
1168  if (!TCR_4(__kmp_init_parallel))
1169  __kmp_parallel_initialize();
1170 
1171  this_thr = __kmp_threads[global_tid];
1172  serial_team = this_thr->th.th_serial_team;
1173 
1174  /* utilize the serialized team held by this thread */
1175  KMP_DEBUG_ASSERT(serial_team);
1176  KMP_MB();
1177 
1178  if (__kmp_tasking_mode != tskm_immediate_exec) {
1179  KMP_DEBUG_ASSERT(
1180  this_thr->th.th_task_team ==
1181  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1182  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1183  NULL);
1184  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1185  "team %p, new task_team = NULL\n",
1186  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1187  this_thr->th.th_task_team = NULL;
1188  }
1189 
1190 #if OMP_40_ENABLED
1191  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1192  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1193  proc_bind = proc_bind_false;
1194  } else if (proc_bind == proc_bind_default) {
1195  // No proc_bind clause was specified, so use the current value
1196  // of proc-bind-var for this parallel region.
1197  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1198  }
1199  // Reset for next parallel region
1200  this_thr->th.th_set_proc_bind = proc_bind_default;
1201 #endif /* OMP_40_ENABLED */
1202 
1203 #if OMPT_SUPPORT
1204  ompt_data_t ompt_parallel_data;
1205  ompt_parallel_data.ptr = NULL;
1206  ompt_data_t *implicit_task_data;
1207  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1208  if (ompt_enabled.enabled &&
1209  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1210 
1211  ompt_task_info_t *parent_task_info;
1212  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1213 
1214  parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1215  if (ompt_enabled.ompt_callback_parallel_begin) {
1216  int team_size = 1;
1217 
1218  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1219  &(parent_task_info->task_data), &(parent_task_info->frame),
1220  &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1221  }
1222  }
1223 #endif // OMPT_SUPPORT
1224 
1225  if (this_thr->th.th_team != serial_team) {
1226  // Nested level will be an index in the nested nthreads array
1227  int level = this_thr->th.th_team->t.t_level;
1228 
1229  if (serial_team->t.t_serialized) {
1230  /* this serial team was already used
1231  TODO increase performance by making these locks more specific */
1232  kmp_team_t *new_team;
1233 
1234  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1235 
1236  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1237 #if OMPT_SUPPORT
1238  ompt_parallel_data,
1239 #endif
1240 #if OMP_40_ENABLED
1241  proc_bind,
1242 #endif
1243  &this_thr->th.th_current_task->td_icvs,
1244  0 USE_NESTED_HOT_ARG(NULL));
1245  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1246  KMP_ASSERT(new_team);
1247 
1248  /* setup new serialized team and install it */
1249  new_team->t.t_threads[0] = this_thr;
1250  new_team->t.t_parent = this_thr->th.th_team;
1251  serial_team = new_team;
1252  this_thr->th.th_serial_team = serial_team;
1253 
1254  KF_TRACE(
1255  10,
1256  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1257  global_tid, serial_team));
1258 
1259  /* TODO the above breaks the requirement that if we run out of resources,
1260  then we can still guarantee that serialized teams are ok, since we may
1261  need to allocate a new one */
1262  } else {
1263  KF_TRACE(
1264  10,
1265  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1266  global_tid, serial_team));
1267  }
1268 
1269  /* we have to initialize this serial team */
1270  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1271  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1272  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1273  serial_team->t.t_ident = loc;
1274  serial_team->t.t_serialized = 1;
1275  serial_team->t.t_nproc = 1;
1276  serial_team->t.t_parent = this_thr->th.th_team;
1277  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1278  this_thr->th.th_team = serial_team;
1279  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1280 
1281  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1282  this_thr->th.th_current_task));
1283  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1284  this_thr->th.th_current_task->td_flags.executing = 0;
1285 
1286  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1287 
1288  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1289  implicit task for each serialized task represented by
1290  team->t.t_serialized? */
1291  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1292  &this_thr->th.th_current_task->td_parent->td_icvs);
1293 
1294  // Thread value exists in the nested nthreads array for the next nested
1295  // level
1296  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1297  this_thr->th.th_current_task->td_icvs.nproc =
1298  __kmp_nested_nth.nth[level + 1];
1299  }
1300 
1301 #if OMP_40_ENABLED
1302  if (__kmp_nested_proc_bind.used &&
1303  (level + 1 < __kmp_nested_proc_bind.used)) {
1304  this_thr->th.th_current_task->td_icvs.proc_bind =
1305  __kmp_nested_proc_bind.bind_types[level + 1];
1306  }
1307 #endif /* OMP_40_ENABLED */
1308 
1309 #if USE_DEBUGGER
1310  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1311 #endif
1312  this_thr->th.th_info.ds.ds_tid = 0;
1313 
1314  /* set thread cache values */
1315  this_thr->th.th_team_nproc = 1;
1316  this_thr->th.th_team_master = this_thr;
1317  this_thr->th.th_team_serialized = 1;
1318 
1319  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1320  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1321 
1322  propagateFPControl(serial_team);
1323 
1324  /* check if we need to allocate dispatch buffers stack */
1325  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1326  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1327  serial_team->t.t_dispatch->th_disp_buffer =
1328  (dispatch_private_info_t *)__kmp_allocate(
1329  sizeof(dispatch_private_info_t));
1330  }
1331  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1332 
1333  KMP_MB();
1334 
1335  } else {
1336  /* this serialized team is already being used,
1337  * that's fine, just add another nested level */
1338  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1339  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1340  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1341  ++serial_team->t.t_serialized;
1342  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1343 
1344  // Nested level will be an index in the nested nthreads array
1345  int level = this_thr->th.th_team->t.t_level;
1346  // Thread value exists in the nested nthreads array for the next nested
1347  // level
1348  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1349  this_thr->th.th_current_task->td_icvs.nproc =
1350  __kmp_nested_nth.nth[level + 1];
1351  }
1352  serial_team->t.t_level++;
1353  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1354  "of serial team %p to %d\n",
1355  global_tid, serial_team, serial_team->t.t_level));
1356 
1357  /* allocate/push dispatch buffers stack */
1358  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1359  {
1360  dispatch_private_info_t *disp_buffer =
1361  (dispatch_private_info_t *)__kmp_allocate(
1362  sizeof(dispatch_private_info_t));
1363  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1364  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1365  }
1366  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1367 
1368  KMP_MB();
1369  }
1370 #if OMP_40_ENABLED
1371  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1372 #endif
1373 
1374  if (__kmp_env_consistency_check)
1375  __kmp_push_parallel(global_tid, NULL);
1376 #if OMPT_SUPPORT
1377  serial_team->t.ompt_team_info.master_return_address = codeptr;
1378  if (ompt_enabled.enabled &&
1379  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1380  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1381 
1382  ompt_lw_taskteam_t lw_taskteam;
1383  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1384  &ompt_parallel_data, codeptr);
1385 
1386  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1387  // don't use lw_taskteam after linking. content was swapped
1388 
1389  /* OMPT implicit task begin */
1390  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1391  if (ompt_enabled.ompt_callback_implicit_task) {
1392  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1393  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1394  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1395  OMPT_CUR_TASK_INFO(this_thr)
1396  ->thread_num = __kmp_tid_from_gtid(global_tid);
1397  }
1398 
1399  /* OMPT state */
1400  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1401  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1402  }
1403 #endif
1404 }
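
/* For exposition only: after __kmp_serialized_parallel() the master keeps
   executing alone, but with the bookkeeping of a one-thread team --
   t_serialized counts how deeply serialized regions are nested, t_level grows
   by one per region, and one dispatch_private_info_t buffer is pushed per
   nesting level so worksharing constructs inside still get private dispatch
   state. For example, forcing serialization with an if(0) clause:

     #pragma omp parallel if(0)        // serialized: t_serialized == 1
     {
     #pragma omp parallel if(0)        // nested: reuses th_serial_team,
     { ... }                           // t_serialized == 2
     }
*/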
1405 
1406 /* most of the work for a fork */
1407 /* return true if we really went parallel, false if serialized */
1408 int __kmp_fork_call(ident_t *loc, int gtid,
1409  enum fork_context_e call_context, // Intel, GNU, ...
1410  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1411 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1412 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1413  va_list *ap
1414 #else
1415  va_list ap
1416 #endif
1417  ) {
1418  void **argv;
1419  int i;
1420  int master_tid;
1421  int master_this_cons;
1422  kmp_team_t *team;
1423  kmp_team_t *parent_team;
1424  kmp_info_t *master_th;
1425  kmp_root_t *root;
1426  int nthreads;
1427  int master_active;
1428  int master_set_numthreads;
1429  int level;
1430 #if OMP_40_ENABLED
1431  int active_level;
1432  int teams_level;
1433 #endif
1434 #if KMP_NESTED_HOT_TEAMS
1435  kmp_hot_team_ptr_t **p_hot_teams;
1436 #endif
1437  { // KMP_TIME_BLOCK
1438  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1439  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1440 
1441  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1442  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1443  /* Some systems prefer the stack for the root thread(s) to start with */
1444  /* some gap from the parent stack to prevent false sharing. */
1445  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1446  /* These 2 lines below are so this does not get optimized out */
1447  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1448  __kmp_stkpadding += (short)((kmp_int64)dummy);
1449  }
1450 
1451  /* initialize if needed */
1452  KMP_DEBUG_ASSERT(
1453  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1454  if (!TCR_4(__kmp_init_parallel))
1455  __kmp_parallel_initialize();
1456 
1457  /* setup current data */
1458  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1459  // shutdown
1460  parent_team = master_th->th.th_team;
1461  master_tid = master_th->th.th_info.ds.ds_tid;
1462  master_this_cons = master_th->th.th_local.this_construct;
1463  root = master_th->th.th_root;
1464  master_active = root->r.r_active;
1465  master_set_numthreads = master_th->th.th_set_nproc;
1466 
1467 #if OMPT_SUPPORT
1468  ompt_data_t ompt_parallel_data;
1469  ompt_parallel_data.ptr = NULL;
1470  ompt_data_t *parent_task_data;
1471  omp_frame_t *ompt_frame;
1472  ompt_data_t *implicit_task_data;
1473  void *return_address = NULL;
1474 
1475  if (ompt_enabled.enabled) {
1476  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1477  NULL, NULL);
1478  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1479  }
1480 #endif
1481 
1482  // Nested level will be an index in the nested nthreads array
1483  level = parent_team->t.t_level;
1484  // used to launch non-serial teams even if nested is not allowed
1485  active_level = parent_team->t.t_active_level;
1486 #if OMP_40_ENABLED
1487  // needed to check nesting inside the teams
1488  teams_level = master_th->th.th_teams_level;
1489 #endif
1490 #if KMP_NESTED_HOT_TEAMS
1491  p_hot_teams = &master_th->th.th_hot_teams;
1492  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1493  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1494  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1495  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1496  // it is either actual or not needed (when active_level > 0)
1497  (*p_hot_teams)[0].hot_team_nth = 1;
1498  }
1499 #endif
1500 
1501 #if OMPT_SUPPORT
1502  if (ompt_enabled.enabled) {
1503  if (ompt_enabled.ompt_callback_parallel_begin) {
1504  int team_size = master_set_numthreads
1505  ? master_set_numthreads
1506  : get__nproc_2(parent_team, master_tid);
1507  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1508  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1509  OMPT_INVOKER(call_context), return_address);
1510  }
1511  master_th->th.ompt_thread_info.state = omp_state_overhead;
1512  }
1513 #endif
1514 
1515  master_th->th.th_ident = loc;
1516 
1517 #if OMP_40_ENABLED
1518  if (master_th->th.th_teams_microtask && ap &&
1519  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1520  // AC: This is the start of a parallel region that is nested inside a teams
1521  // construct. The team is actual (hot); all workers are ready at the fork
1522  // barrier. No lock is needed to initialize the team a bit, then release the workers.
1523  parent_team->t.t_ident = loc;
1524  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1525  parent_team->t.t_argc = argc;
1526  argv = (void **)parent_team->t.t_argv;
1527  for (i = argc - 1; i >= 0; --i)
1528 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1529 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1530  *argv++ = va_arg(*ap, void *);
1531 #else
1532  *argv++ = va_arg(ap, void *);
1533 #endif
1534  // Increment our nested depth level, but do not increase the serialization
1535  if (parent_team == master_th->th.th_serial_team) {
1536  // AC: we are in serialized parallel
1537  __kmpc_serialized_parallel(loc, gtid);
1538  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1539  // AC: need this so that inquiry functions work
1540  // correctly; will restore at join time
1541  parent_team->t.t_serialized--;
1542 #if OMPT_SUPPORT
1543  void *dummy;
1544  void **exit_runtime_p;
1545 
1546  ompt_lw_taskteam_t lw_taskteam;
1547 
1548  if (ompt_enabled.enabled) {
1549  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1550  &ompt_parallel_data, return_address);
1551  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1552 
1553  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1554  // don't use lw_taskteam after linking. content was swapped
1555 
1556  /* OMPT implicit task begin */
1557  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1558  if (ompt_enabled.ompt_callback_implicit_task) {
1559  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1560  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1561  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1562  OMPT_CUR_TASK_INFO(master_th)
1563  ->thread_num = __kmp_tid_from_gtid(gtid);
1564  }
1565 
1566  /* OMPT state */
1567  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1568  } else {
1569  exit_runtime_p = &dummy;
1570  }
1571 #endif
1572 
1573  {
1574  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1575  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1576  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1577 #if OMPT_SUPPORT
1578  ,
1579  exit_runtime_p
1580 #endif
1581  );
1582  }
1583 
1584 #if OMPT_SUPPORT
1585  *exit_runtime_p = NULL;
1586  if (ompt_enabled.enabled) {
1587  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1588  if (ompt_enabled.ompt_callback_implicit_task) {
1589  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1590  ompt_scope_end, NULL, implicit_task_data, 1,
1591  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1592  }
1593  __ompt_lw_taskteam_unlink(master_th);
1594 
1595  if (ompt_enabled.ompt_callback_parallel_end) {
1596  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1597  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1598  OMPT_INVOKER(call_context), return_address);
1599  }
1600  master_th->th.ompt_thread_info.state = omp_state_overhead;
1601  }
1602 #endif
1603  return TRUE;
1604  }
1605 
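  // Not serialized: the parallel region nested in teams runs on the existing
  // parent (teams) team, so just install the microtask, bump the nesting
  // levels, and release the workers via __kmp_internal_fork() below.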
1606  parent_team->t.t_pkfn = microtask;
1607  parent_team->t.t_invoke = invoker;
1608  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1609  parent_team->t.t_active_level++;
1610  parent_team->t.t_level++;
1611 
1612  /* Change number of threads in the team if requested */
1613  if (master_set_numthreads) { // The parallel has num_threads clause
1614  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1615  // AC: the number of threads can only be reduced dynamically; it cannot be increased
1616  kmp_info_t **other_threads = parent_team->t.t_threads;
1617  parent_team->t.t_nproc = master_set_numthreads;
1618  for (i = 0; i < master_set_numthreads; ++i) {
1619  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1620  }
1621  // Keep the extra threads hot in the team for possible subsequent parallel regions
1622  }
1623  master_th->th.th_set_nproc = 0;
1624  }
1625 
1626 #if USE_DEBUGGER
1627  if (__kmp_debugging) { // Let debugger override number of threads.
1628  int nth = __kmp_omp_num_threads(loc);
1629  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1630  master_set_numthreads = nth;
1631  }
1632  }
1633 #endif
1634 
1635  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1636  "master_th=%p, gtid=%d\n",
1637  root, parent_team, master_th, gtid));
1638  __kmp_internal_fork(loc, gtid, parent_team);
1639  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1640  "master_th=%p, gtid=%d\n",
1641  root, parent_team, master_th, gtid));
1642 
1643  /* Invoke microtask for MASTER thread */
1644  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1645  parent_team->t.t_id, parent_team->t.t_pkfn));
1646 
1647  if (!parent_team->t.t_invoke(gtid)) {
1648  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1649  }
1650  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1651  parent_team->t.t_id, parent_team->t.t_pkfn));
1652  KMP_MB(); /* Flush all pending memory write invalidates. */
1653 
1654  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1655 
1656  return TRUE;
1657  } // Parallel closely nested in teams construct
1658 #endif /* OMP_40_ENABLED */
1659 
1660 #if KMP_DEBUG
1661  if (__kmp_tasking_mode != tskm_immediate_exec) {
1662  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1663  parent_team->t.t_task_team[master_th->th.th_task_state]);
1664  }
1665 #endif
1666 
1667  if (parent_team->t.t_active_level >=
1668  master_th->th.th_current_task->td_icvs.max_active_levels) {
1669  nthreads = 1;
1670  } else {
1671 #if OMP_40_ENABLED
1672  int enter_teams = ((ap == NULL && active_level == 0) ||
1673  (ap && teams_level > 0 && teams_level == level));
1674 #endif
1675  nthreads =
1676  master_set_numthreads
1677  ? master_set_numthreads
1678  : get__nproc_2(
1679  parent_team,
1680  master_tid); // TODO: get nproc directly from current task
1681 
1682  // Check whether we need to take the forkjoin lock (there is no need for a
1683  // serialized parallel outside of a teams construct). This code was moved here
1684  // from __kmp_reserve_threads() to speed up nested serialized parallel regions.
1685  if (nthreads > 1) {
1686  if ((!get__nested(master_th) && (root->r.r_in_parallel
1687 #if OMP_40_ENABLED
1688  && !enter_teams
1689 #endif /* OMP_40_ENABLED */
1690  )) ||
1691  (__kmp_library == library_serial)) {
1692  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1693  " threads\n",
1694  gtid, nthreads));
1695  nthreads = 1;
1696  }
1697  }
1698  if (nthreads > 1) {
1699  /* determine how many new threads we can use */
1700  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1701  nthreads = __kmp_reserve_threads(
1702  root, parent_team, master_tid, nthreads
1703 #if OMP_40_ENABLED
1704  /* AC: If we execute teams from a parallel region (on the host), then the
1705  teams should be created, but each can have only 1 thread if nesting is
1706  disabled. If teams is called from a serial region, then the teams and
1707  their threads should be created regardless of the nesting
1708  setting. */
1709  ,
1710  enter_teams
1711 #endif /* OMP_40_ENABLED */
1712  );
1713  if (nthreads == 1) {
1714  // Release the lock for single-threaded execution here; for multi-threaded
1715  // execution it will be released later, after the team of threads has been
1716  // created and initialized
1717  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1718  }
1719  }
1720  }
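  // At this point nthreads is final: 1 means the region will be serialized;
  // >1 means threads were reserved and __kmp_forkjoin_lock is still held until
  // after the new team has been set up.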
1721  KMP_DEBUG_ASSERT(nthreads > 0);
1722 
1723  // If we temporarily changed the set number of threads then restore it now
1724  master_th->th.th_set_nproc = 0;
1725 
1726  /* create a serialized parallel region? */
1727  if (nthreads == 1) {
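// Serialized path: for the Intel entry point the arguments are staged in a
// stack buffer (a VLA where the build supports it, alloca otherwise) and the
// microtask is invoked directly on this thread.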
1728 /* josh todo: hypothetical question: what do we do for OS X*? */
1729 #if KMP_OS_LINUX && \
1730  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1731  void *args[argc];
1732 #else
1733  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1734 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1735  KMP_ARCH_AARCH64) */
1736 
1737  KA_TRACE(20,
1738  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1739 
1740  __kmpc_serialized_parallel(loc, gtid);
1741 
1742  if (call_context == fork_context_intel) {
1743  /* TODO this sucks, use the compiler itself to pass args! :) */
1744  master_th->th.th_serial_team->t.t_ident = loc;
1745 #if OMP_40_ENABLED
1746  if (!ap) {
1747  // revert change made in __kmpc_serialized_parallel()
1748  master_th->th.th_serial_team->t.t_level--;
1749 // Get args from parent team for teams construct
1750 
1751 #if OMPT_SUPPORT
1752  void *dummy;
1753  void **exit_runtime_p;
1754  ompt_task_info_t *task_info;
1755 
1756  ompt_lw_taskteam_t lw_taskteam;
1757 
1758  if (ompt_enabled.enabled) {
1759  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1760  &ompt_parallel_data, return_address);
1761 
1762  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1763  // don't use lw_taskteam after linking. content was swapped
1764 
1765  task_info = OMPT_CUR_TASK_INFO(master_th);
1766  exit_runtime_p = &(task_info->frame.exit_frame);
1767  if (ompt_enabled.ompt_callback_implicit_task) {
1768  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1769  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1770  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1771  OMPT_CUR_TASK_INFO(master_th)
1772  ->thread_num = __kmp_tid_from_gtid(gtid);
1773  }
1774 
1775  /* OMPT state */
1776  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1777  } else {
1778  exit_runtime_p = &dummy;
1779  }
1780 #endif
1781 
1782  {
1783  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1784  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1785  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1786  parent_team->t.t_argv
1787 #if OMPT_SUPPORT
1788  ,
1789  exit_runtime_p
1790 #endif
1791  );
1792  }
1793 
1794 #if OMPT_SUPPORT
1795  if (ompt_enabled.enabled) {
1796  exit_runtime_p = NULL;
1797  if (ompt_enabled.ompt_callback_implicit_task) {
1798  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1799  ompt_scope_end, NULL, &(task_info->task_data), 1,
1800  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1801  }
1802 
1803  __ompt_lw_taskteam_unlink(master_th);
1804  if (ompt_enabled.ompt_callback_parallel_end) {
1805  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1806  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1807  OMPT_INVOKER(call_context), return_address);
1808  }
1809  master_th->th.ompt_thread_info.state = omp_state_overhead;
1810  }
1811 #endif
1812  } else if (microtask == (microtask_t)__kmp_teams_master) {
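  // Serialized teams construct: the master's serial team acts as the league,
  // so copy the arguments onto it and call the supplied invoker directly.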
1813  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1814  master_th->th.th_serial_team);
1815  team = master_th->th.th_team;
1816  // team->t.t_pkfn = microtask;
1817  team->t.t_invoke = invoker;
1818  __kmp_alloc_argv_entries(argc, team, TRUE);
1819  team->t.t_argc = argc;
1820  argv = (void **)team->t.t_argv;
1821  if (ap) {
1822  for (i = argc - 1; i >= 0; --i)
1823 // TODO: revert workaround for Intel(R) 64 tracker #96
1824 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1825  *argv++ = va_arg(*ap, void *);
1826 #else
1827  *argv++ = va_arg(ap, void *);
1828 #endif
1829  } else {
1830  for (i = 0; i < argc; ++i)
1831  // Get args from parent team for teams construct
1832  argv[i] = parent_team->t.t_argv[i];
1833  }
1834  // AC: revert change made in __kmpc_serialized_parallel()
1835  // because the initial code in teams should have level=0
1836  team->t.t_level--;
1837  // AC: call special invoker for outer "parallel" of teams construct
1838  invoker(gtid);
1839  } else {
1840 #endif /* OMP_40_ENABLED */
1841  argv = args;
1842  for (i = argc - 1; i >= 0; --i)
1843 // TODO: revert workaround for Intel(R) 64 tracker #96
1844 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1845  *argv++ = va_arg(*ap, void *);
1846 #else
1847  *argv++ = va_arg(ap, void *);
1848 #endif
1849  KMP_MB();
1850 
1851 #if OMPT_SUPPORT
1852  void *dummy;
1853  void **exit_runtime_p;
1854  ompt_task_info_t *task_info;
1855 
1856  ompt_lw_taskteam_t lw_taskteam;
1857 
1858  if (ompt_enabled.enabled) {
1859  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1860  &ompt_parallel_data, return_address);
1861  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1862  // don't use lw_taskteam after linking. content was swapped
1863  task_info = OMPT_CUR_TASK_INFO(master_th);
1864  exit_runtime_p = &(task_info->frame.exit_frame);
1865 
1866  /* OMPT implicit task begin */
1867  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1868  if (ompt_enabled.ompt_callback_implicit_task) {
1869  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1870  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1871  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1872  OMPT_CUR_TASK_INFO(master_th)
1873  ->thread_num = __kmp_tid_from_gtid(gtid);
1874  }
1875 
1876  /* OMPT state */
1877  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1878  } else {
1879  exit_runtime_p = &dummy;
1880  }
1881 #endif
1882 
1883  {
1884  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1885  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1886  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1887 #if OMPT_SUPPORT
1888  ,
1889  exit_runtime_p
1890 #endif
1891  );
1892  }
1893 
1894 #if OMPT_SUPPORT
1895  if (ompt_enabled.enabled) {
1896  *exit_runtime_p = NULL;
1897  if (ompt_enabled.ompt_callback_implicit_task) {
1898  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1899  ompt_scope_end, NULL, &(task_info->task_data), 1,
1900  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1901  }
1902 
1903  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1904  __ompt_lw_taskteam_unlink(master_th);
1905  if (ompt_enabled.ompt_callback_parallel_end) {
1906  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1907  &ompt_parallel_data, parent_task_data,
1908  OMPT_INVOKER(call_context), return_address);
1909  }
1910  master_th->th.ompt_thread_info.state = omp_state_overhead;
1911  }
1912 #endif
1913 #if OMP_40_ENABLED
1914  }
1915 #endif /* OMP_40_ENABLED */
1916  } else if (call_context == fork_context_gnu) {
1917 #if OMPT_SUPPORT
1918  ompt_lw_taskteam_t lwt;
1919  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1920  return_address);
1921 
1922  lwt.ompt_task_info.frame.exit_frame = NULL;
1923  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1924 // don't use lw_taskteam after linking. content was swapped
1925 #endif
1926 
1927  // we were called from GNU native code
1928  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1929  return FALSE;
1930  } else {
1931  KMP_ASSERT2(call_context < fork_context_last,
1932  "__kmp_fork_call: unknown fork_context parameter");
1933  }
1934 
1935  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1936  KMP_MB();
1937  return FALSE;
1938  }
1939 
1940  // GEH: only modify the executing flag in the case when not serialized;
1941  // the serialized case is handled in __kmpc_serialized_parallel
1942  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1943  "curtask=%p, curtask_max_aclevel=%d\n",
1944  parent_team->t.t_active_level, master_th,
1945  master_th->th.th_current_task,
1946  master_th->th.th_current_task->td_icvs.max_active_levels));
1947  // TODO: GEH - cannot do this assertion because root thread not set up as
1948  // executing
1949  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1950  master_th->th.th_current_task->td_flags.executing = 0;
1951 
1952 #if OMP_40_ENABLED
1953  if (!master_th->th.th_teams_microtask || level > teams_level)
1954 #endif /* OMP_40_ENABLED */
1955  {
1956  /* Increment our nested depth level */
1957  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1958  }
1959 
1960  // See if we need to make a copy of the ICVs.
1961  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1962  if ((level + 1 < __kmp_nested_nth.used) &&
1963  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1964  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1965  } else {
1966  nthreads_icv = 0; // don't update
1967  }
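  // For example, with a nested list such as OMP_NUM_THREADS="8,4" (which is
  // typically parsed into __kmp_nested_nth), the team created by the outermost
  // parallel region (parent level 0) gets nproc ICV 4 here, so regions nested
  // inside it default to 4 threads; 0 means the inherited ICV is kept.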
1968 
1969 #if OMP_40_ENABLED
1970  // Figure out the proc_bind_policy for the new team.
1971  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1972  kmp_proc_bind_t proc_bind_icv =
1973  proc_bind_default; // proc_bind_default means don't update
1974  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1975  proc_bind = proc_bind_false;
1976  } else {
1977  if (proc_bind == proc_bind_default) {
1978  // No proc_bind clause specified; use current proc-bind-var for this
1979  // parallel region
1980  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1981  }
1982  /* else: The proc_bind policy was specified explicitly on the parallel clause.
1983  This overrides proc-bind-var for this parallel region, but does not
1984  change proc-bind-var. */
1985  // Figure out the value of proc-bind-var for the child threads.
1986  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1987  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1988  master_th->th.th_current_task->td_icvs.proc_bind)) {
1989  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1990  }
1991  }
1992 
1993  // Reset for next parallel region
1994  master_th->th.th_set_proc_bind = proc_bind_default;
1995 #endif /* OMP_40_ENABLED */
1996 
1997  if ((nthreads_icv > 0)
1998 #if OMP_40_ENABLED
1999  || (proc_bind_icv != proc_bind_default)
2000 #endif /* OMP_40_ENABLED */
2001  ) {
2002  kmp_internal_control_t new_icvs;
2003  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2004  new_icvs.next = NULL;
2005  if (nthreads_icv > 0) {
2006  new_icvs.nproc = nthreads_icv;
2007  }
2008 
2009 #if OMP_40_ENABLED
2010  if (proc_bind_icv != proc_bind_default) {
2011  new_icvs.proc_bind = proc_bind_icv;
2012  }
2013 #endif /* OMP_40_ENABLED */
2014 
2015  /* allocate a new parallel team */
2016  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2017  team = __kmp_allocate_team(root, nthreads, nthreads,
2018 #if OMPT_SUPPORT
2019  ompt_parallel_data,
2020 #endif
2021 #if OMP_40_ENABLED
2022  proc_bind,
2023 #endif
2024  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2025  } else {
2026  /* allocate a new parallel team */
2027  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2028  team = __kmp_allocate_team(root, nthreads, nthreads,
2029 #if OMPT_SUPPORT
2030  ompt_parallel_data,
2031 #endif
2032 #if OMP_40_ENABLED
2033  proc_bind,
2034 #endif
2035  &master_th->th.th_current_task->td_icvs,
2036  argc USE_NESTED_HOT_ARG(master_th));
2037  }
2038  KF_TRACE(
2039  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2040 
2041  /* setup the new team */
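  // KMP_CHECK_UPDATE only stores when the value actually differs, to avoid
  // needlessly dirtying cache lines of a reused (hot) team.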
2042  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2043  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2044  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2045  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2046  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2047 #if OMPT_SUPPORT
2048  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2049  return_address);
2050 #endif
2051  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2052 // TODO: parent_team->t.t_level == INT_MAX ???
2053 #if OMP_40_ENABLED
2054  if (!master_th->th.th_teams_microtask || level > teams_level) {
2055 #endif /* OMP_40_ENABLED */
2056  int new_level = parent_team->t.t_level + 1;
2057  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2058  new_level = parent_team->t.t_active_level + 1;
2059  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2060 #if OMP_40_ENABLED
2061  } else {
2062  // AC: Do not increase parallel level at start of the teams construct
2063  int new_level = parent_team->t.t_level;
2064  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2065  new_level = parent_team->t.t_active_level;
2066  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2067  }
2068 #endif /* OMP_40_ENABLED */
2069  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2070  // set master's schedule as new run-time schedule
2071  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2072 
2073 #if OMP_40_ENABLED
2074  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2075 #endif
2076 
2077  // Update the floating point rounding in the team if required.
2078  propagateFPControl(team);
2079 
2080  if (__kmp_tasking_mode != tskm_immediate_exec) {
2081  // Set the master's task team to the team's task team. Unless this is a hot
2082  // team, it should be NULL.
2083  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2084  parent_team->t.t_task_team[master_th->th.th_task_state]);
2085  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2086  "%p, new task_team %p / team %p\n",
2087  __kmp_gtid_from_thread(master_th),
2088  master_th->th.th_task_team, parent_team,
2089  team->t.t_task_team[master_th->th.th_task_state], team));
2090 
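  // Push the master's current task_state onto its memo stack (growing the
  // stack if necessary) so it can be restored at join; the new team starts
  // with task_state 0 unless it is a nested hot team, in which case the
  // previously saved state is reused.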
2091  if (active_level || master_th->th.th_task_team) {
2092  // Take a memo of master's task_state
2093  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2094  if (master_th->th.th_task_state_top >=
2095  master_th->th.th_task_state_stack_sz) { // increase size
2096  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2097  kmp_uint8 *old_stack, *new_stack;
2098  kmp_uint32 i;
2099  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2100  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2101  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2102  }
2103  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2104  ++i) { // zero-init rest of stack
2105  new_stack[i] = 0;
2106  }
2107  old_stack = master_th->th.th_task_state_memo_stack;
2108  master_th->th.th_task_state_memo_stack = new_stack;
2109  master_th->th.th_task_state_stack_sz = new_size;
2110  __kmp_free(old_stack);
2111  }
2112  // Store master's task_state on stack
2113  master_th->th
2114  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2115  master_th->th.th_task_state;
2116  master_th->th.th_task_state_top++;
2117 #if KMP_NESTED_HOT_TEAMS
2118  if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2119  // Restore the master's nested state if this is a nested hot team
2120  master_th->th.th_task_state =
2121  master_th->th
2122  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2123  } else {
2124 #endif
2125  master_th->th.th_task_state = 0;
2126 #if KMP_NESTED_HOT_TEAMS
2127  }
2128 #endif
2129  }
2130 #if !KMP_NESTED_HOT_TEAMS
2131  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2132  (team == root->r.r_hot_team));
2133 #endif
2134  }
2135 
2136  KA_TRACE(
2137  20,
2138  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2139  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2140  team->t.t_nproc));
2141  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2142  (team->t.t_master_tid == 0 &&
2143  (team->t.t_parent == root->r.r_root_team ||
2144  team->t.t_parent->t.t_serialized)));
2145  KMP_MB();
2146 
2147  /* now, setup the arguments */
2148  argv = (void **)team->t.t_argv;
2149 #if OMP_40_ENABLED
2150  if (ap) {
2151 #endif /* OMP_40_ENABLED */
2152  for (i = argc - 1; i >= 0; --i) {
2153 // TODO: revert workaround for Intel(R) 64 tracker #96
2154 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2155  void *new_argv = va_arg(*ap, void *);
2156 #else
2157  void *new_argv = va_arg(ap, void *);
2158 #endif
2159  KMP_CHECK_UPDATE(*argv, new_argv);
2160  argv++;
2161  }
2162 #if OMP_40_ENABLED
2163  } else {
2164  for (i = 0; i < argc; ++i) {
2165  // Get args from parent team for teams construct
2166  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2167  }
2168  }
2169 #endif /* OMP_40_ENABLED */
2170 
2171  /* now actually fork the threads */
2172  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2173  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2174  root->r.r_active = TRUE;
2175 
2176  __kmp_fork_team_threads(root, team, master_th, gtid);
2177  __kmp_setup_icv_copy(team, nthreads,
2178  &master_th->th.th_current_task->td_icvs, loc);
2179 
2180 #if OMPT_SUPPORT
2181  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2182 #endif
2183 
2184  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2185 
2186 #if USE_ITT_BUILD
2187  if (team->t.t_active_level == 1 // only report frames at level 1
2188 #if OMP_40_ENABLED
2189  && !master_th->th.th_teams_microtask // not in teams construct
2190 #endif /* OMP_40_ENABLED */
2191  ) {
2192 #if USE_ITT_NOTIFY
2193  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2194  (__kmp_forkjoin_frames_mode == 3 ||
2195  __kmp_forkjoin_frames_mode == 1)) {
2196  kmp_uint64 tmp_time = 0;
2197  if (__itt_get_timestamp_ptr)
2198  tmp_time = __itt_get_timestamp();
2199  // Internal fork - report frame begin
2200  master_th->th.th_frame_time = tmp_time;
2201  if (__kmp_forkjoin_frames_mode == 3)
2202  team->t.t_region_time = tmp_time;
2203  } else
2204 // only one notification scheme (either "submit" or "forking/joined", not both)
2205 #endif /* USE_ITT_NOTIFY */
2206  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2207  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2208  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2209  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2210  }
2211  }
2212 #endif /* USE_ITT_BUILD */
2213 
2214  /* now go on and do the work */
2215  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2216  KMP_MB();
2217  KF_TRACE(10,
2218  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2219  root, team, master_th, gtid));
2220 
2221 #if USE_ITT_BUILD
2222  if (__itt_stack_caller_create_ptr) {
2223  team->t.t_stack_id =
2224  __kmp_itt_stack_caller_create(); // create new stack stitching id
2225  // before entering fork barrier
2226  }
2227 #endif /* USE_ITT_BUILD */
2228 
2229 #if OMP_40_ENABLED
2230  // AC: skip __kmp_internal_fork for the teams construct; let only the master
2231  // threads execute
2232  if (ap)
2233 #endif /* OMP_40_ENABLED */
2234  {
2235  __kmp_internal_fork(loc, gtid, team);
2236  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2237  "master_th=%p, gtid=%d\n",
2238  root, team, master_th, gtid));
2239  }
2240 
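  // For the GNU (libgomp-compatible) entry point the caller invokes the
  // outlined function itself, so return now; for the Intel entry point the
  // master invokes the microtask below via t_invoke.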
2241  if (call_context == fork_context_gnu) {
2242  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2243  return TRUE;
2244  }
2245 
2246  /* Invoke microtask for MASTER thread */
2247  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2248  team->t.t_id, team->t.t_pkfn));
2249  } // END of timer KMP_fork_call block
2250 
2251  if (!team->t.t_invoke(gtid)) {
2252  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2253  }
2254  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2255  team->t.t_id, team->t.t_pkfn));
2256  KMP_MB(); /* Flush all pending memory write invalidates. */
2257 
2258  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2259 
2260 #if OMPT_SUPPORT
2261  if (ompt_enabled.enabled) {
2262  master_th->th.ompt_thread_info.state = omp_state_overhead;
2263  }
2264 #endif
2265 
2266  return TRUE;
2267 }
2268 
2269 #if OMPT_SUPPORT
2270 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2271  kmp_team_t *team) {
2272  // restore state outside the region
2273  thread->th.ompt_thread_info.state =
2274  ((team->t.t_serialized) ? omp_state_work_serial
2275  : omp_state_work_parallel);
2276 }
2277 
2278 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2279  kmp_team_t *team, ompt_data_t *parallel_data,
2280  fork_context_e fork_context, void *codeptr) {
2281  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2282  if (ompt_enabled.ompt_callback_parallel_end) {
2283  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2284  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2285  codeptr);
2286  }
2287 
2288  task_info->frame.enter_frame = NULL;
2289  __kmp_join_restore_state(thread, team);
2290 }
2291 #endif
2292 
2293 void __kmp_join_call(ident_t *loc, int gtid
2294 #if OMPT_SUPPORT
2295  ,
2296  enum fork_context_e fork_context
2297 #endif
2298 #if OMP_40_ENABLED
2299  ,
2300  int exit_teams
2301 #endif /* OMP_40_ENABLED */
2302  ) {
2303  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2304  kmp_team_t *team;
2305  kmp_team_t *parent_team;
2306  kmp_info_t *master_th;
2307  kmp_root_t *root;
2308  int master_active;
2309  int i;
2310 
2311  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2312 
2313  /* setup current data */
2314  master_th = __kmp_threads[gtid];
2315  root = master_th->th.th_root;
2316  team = master_th->th.th_team;
2317  parent_team = team->t.t_parent;
2318 
2319  master_th->th.th_ident = loc;
2320 
2321 #if OMPT_SUPPORT
2322  if (ompt_enabled.enabled) {
2323  master_th->th.ompt_thread_info.state = omp_state_overhead;
2324  }
2325 #endif
2326 
2327 #if KMP_DEBUG
2328  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2329  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2330  "th_task_team = %p\n",
2331  __kmp_gtid_from_thread(master_th), team,
2332  team->t.t_task_team[master_th->th.th_task_state],
2333  master_th->th.th_task_team));
2334  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2335  team->t.t_task_team[master_th->th.th_task_state]);
2336  }
2337 #endif
2338 
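  // If the region was serialized, nothing was actually forked: just unwind the
  // serialized state (with special handling for the teams construct) and return.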
2339  if (team->t.t_serialized) {
2340 #if OMP_40_ENABLED
2341  if (master_th->th.th_teams_microtask) {
2342  // We are in teams construct
2343  int level = team->t.t_level;
2344  int tlevel = master_th->th.th_teams_level;
2345  if (level == tlevel) {
2346  // AC: we have not incremented it earlier, at the start of the teams
2347  // construct, so do it here, at the end of the teams construct
2348  team->t.t_level++;
2349  } else if (level == tlevel + 1) {
2350  // AC: we are exiting a parallel region inside teams; increment the
2351  // serialization count in order to restore it in the next call to
2352  // __kmpc_end_serialized_parallel
2353  team->t.t_serialized++;
2354  }
2355  }
2356 #endif /* OMP_40_ENABLED */
2357  __kmpc_end_serialized_parallel(loc, gtid);
2358 
2359 #if OMPT_SUPPORT
2360  if (ompt_enabled.enabled) {
2361  __kmp_join_restore_state(master_th, parent_team);
2362  }
2363 #endif
2364 
2365  return;
2366  }
2367 
2368  master_active = team->t.t_master_active;
2369 
2370 #if OMP_40_ENABLED
2371  if (!exit_teams)
2372 #endif /* OMP_40_ENABLED */
2373  {
2374  // AC: There is no barrier for the internal teams at exit from a teams
2375  // construct, but there is a barrier for the external team (the league).
2376  __kmp_internal_join(loc, gtid, team);
2377  }
2378 #if OMP_40_ENABLED
2379  else {
2380  master_th->th.th_task_state =
2381  0; // AC: no tasking in teams (out of any parallel)
2382  }
2383 #endif /* OMP_40_ENABLED */
2384 
2385  KMP_MB();
2386 
2387 #if OMPT_SUPPORT
2388  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2389  void *codeptr = team->t.ompt_team_info.master_return_address;
2390 #endif
2391 
2392 #if USE_ITT_BUILD
2393  if (__itt_stack_caller_create_ptr) {
2394  __kmp_itt_stack_caller_destroy(
2395  (__itt_caller)team->t
2396  .t_stack_id); // destroy the stack stitching id after join barrier
2397  }
2398 
2399  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2400  if (team->t.t_active_level == 1
2401 #if OMP_40_ENABLED
2402  && !master_th->th.th_teams_microtask /* not in teams construct */
2403 #endif /* OMP_40_ENABLED */
2404  ) {
2405  master_th->th.th_ident = loc;
2406  // only one notification scheme (either "submit" or "forking/joined", not
2407  // both)
2408  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2409  __kmp_forkjoin_frames_mode == 3)
2410  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2411  master_th->th.th_frame_time, 0, loc,
2412  master_th->th.th_team_nproc, 1);
2413  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2414  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2415  __kmp_itt_region_joined(gtid);
2416  } // active_level == 1
2417 #endif /* USE_ITT_BUILD */
2418 
2419 #if OMP_40_ENABLED
2420  if (master_th->th.th_teams_microtask && !exit_teams &&
2421  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2422  team->t.t_level == master_th->th.th_teams_level + 1) {
2423  // AC: We need to leave the team structure intact at the end of a parallel
2424  // region inside the teams construct, so that the same (hot) team works at
2425  // the next parallel region; only adjust the nesting levels
2426 
2427  /* Decrement our nested depth level */
2428  team->t.t_level--;
2429  team->t.t_active_level--;
2430  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2431 
2432  /* Restore number of threads in the team if needed */
2433  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2434  int old_num = master_th->th.th_team_nproc;
2435  int new_num = master_th->th.th_teams_size.nth;
2436  kmp_info_t **other_threads = team->t.t_threads;
2437  team->t.t_nproc = new_num;
2438  for (i = 0; i < old_num; ++i) {
2439  other_threads[i]->th.th_team_nproc = new_num;
2440  }
2441  // Adjust the state of the previously unused threads of the team
2442  for (i = old_num; i < new_num; ++i) {
2443  // Re-initialize thread's barrier data.
2444  int b;
2445  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2446  for (b = 0; b < bs_last_barrier; ++b) {
2447  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2448  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2449 #if USE_DEBUGGER
2450  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2451 #endif
2452  }
2453  if (__kmp_tasking_mode != tskm_immediate_exec) {
2454  // Synchronize thread's task state
2455  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2456  }
2457  }
2458  }
2459 
2460 #if OMPT_SUPPORT
2461  if (ompt_enabled.enabled) {
2462  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2463  codeptr);
2464  }
2465 #endif
2466 
2467  return;
2468  }
2469 #endif /* OMP_40_ENABLED */
2470 
2471  /* do cleanup and restore the parent team */
2472  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2473  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2474 
2475  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2476 
2477  /* jc: The following lock has instructions with REL and ACQ semantics,
2478  separating the parallel user code called in this parallel region
2479  from the serial user code called after this function returns. */
2480  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2481 
2482 #if OMP_40_ENABLED
2483  if (!master_th->th.th_teams_microtask ||
2484  team->t.t_level > master_th->th.th_teams_level)
2485 #endif /* OMP_40_ENABLED */
2486  {
2487  /* Decrement our nested depth level */
2488  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2489  }
2490  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2491 
2492 #if OMPT_SUPPORT
2493  if (ompt_enabled.enabled) {
2494  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2495  if (ompt_enabled.ompt_callback_implicit_task) {
2496  int ompt_team_size = team->t.t_nproc;
2497  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2498  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2499  OMPT_CUR_TASK_INFO(master_th)->thread_num);
2500  }
2501 
2502  task_info->frame.exit_frame = NULL;
2503  task_info->task_data = ompt_data_none;
2504  }
2505 #endif
2506 
2507  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508  master_th, team));
2509  __kmp_pop_current_task_from_thread(master_th);
2510 
2511 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2512  // Restore master thread's partition.
2513  master_th->th.th_first_place = team->t.t_first_place;
2514  master_th->th.th_last_place = team->t.t_last_place;
2515 #endif /* OMP_40_ENABLED */
2516 
2517  updateHWFPControl(team);
2518 
2519  if (root->r.r_active != master_active)
2520  root->r.r_active = master_active;
2521 
2522  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2523  master_th)); // this will free worker threads
2524 
2525  /* This race was fun to find. Make sure the following is inside the critical
2526  region; otherwise assertions may fail occasionally, since the old team may be
2527  reallocated and the hierarchy then appears inconsistent. It is actually safe
2528  to run and won't cause any bugs, but it will cause those assertion failures.
2529  It's only one deref & assign, so we might as well keep it in the critical region */
2530  master_th->th.th_team = parent_team;
2531  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2532  master_th->th.th_team_master = parent_team->t.t_threads[0];
2533  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2534 
2535  /* restore serialized team, if need be */
2536  if (parent_team->t.t_serialized &&
2537  parent_team != master_th->th.th_serial_team &&
2538  parent_team != root->r.r_root_team) {
2539  __kmp_free_team(root,
2540  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2541  master_th->th.th_serial_team = parent_team;
2542  }
2543 
2544  if (__kmp_tasking_mode != tskm_immediate_exec) {
2545  if (master_th->th.th_task_state_top >
2546  0) { // Restore task state from memo stack
2547  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2548  // Remember master's state if we re-use this nested hot team
2549  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2550  master_th->th.th_task_state;
2551  --master_th->th.th_task_state_top; // pop
2552  // Now restore state at this level
2553  master_th->th.th_task_state =
2554  master_th->th
2555  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2556  }
2557  // Copy the task team from the parent team to the master thread
2558  master_th->th.th_task_team =
2559  parent_team->t.t_task_team[master_th->th.th_task_state];
2560  KA_TRACE(20,
2561  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2562  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2563  parent_team));
2564  }
2565 
2566  // TODO: GEH - cannot do this assertion because root thread not set up as
2567  // executing
2568  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2569  master_th->th.th_current_task->td_flags.executing = 1;
2570 
2571  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2572 
2573 #if OMPT_SUPPORT
2574  if (ompt_enabled.enabled) {
2575  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2576  codeptr);
2577  }
2578 #endif
2579 
2580  KMP_MB();
2581  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2582 }
2583 
2584 /* Check whether we should push an internal control record onto the
2585  serial team stack. If so, do it. */
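/* A record is pushed at most once per serial nesting depth, the first time an
   ICV is modified at that depth, so a call such as omp_set_num_threads() made
   inside a nested serialized region can be undone when that region ends (see
   __kmpc_end_serialized_parallel). */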
2586 void __kmp_save_internal_controls(kmp_info_t *thread) {
2587 
2588  if (thread->th.th_team != thread->th.th_serial_team) {
2589  return;
2590  }
2591  if (thread->th.th_team->t.t_serialized > 1) {
2592  int push = 0;
2593 
2594  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2595  push = 1;
2596  } else {
2597  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2598  thread->th.th_team->t.t_serialized) {
2599  push = 1;
2600  }
2601  }
2602  if (push) { /* push a record on the serial team's stack */
2603  kmp_internal_control_t *control =
2604  (kmp_internal_control_t *)__kmp_allocate(
2605  sizeof(kmp_internal_control_t));
2606 
2607  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2608 
2609  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2610 
2611  control->next = thread->th.th_team->t.t_control_stack_top;
2612  thread->th.th_team->t.t_control_stack_top = control;
2613  }
2614  }
2615 }
2616 
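/* Typical call path (a sketch, not an exhaustive list): the user-level
   omp_set_num_threads(n) entry point resolves the caller's gtid and forwards
   here, roughly as __kmp_set_num_threads(n, __kmp_entry_gtid()). */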
2617 /* Changes set_nproc */
2618 void __kmp_set_num_threads(int new_nth, int gtid) {
2619  kmp_info_t *thread;
2620  kmp_root_t *root;
2621 
2622  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2623  KMP_DEBUG_ASSERT(__kmp_init_serial);
2624 
2625  if (new_nth < 1)
2626  new_nth = 1;
2627  else if (new_nth > __kmp_max_nth)
2628  new_nth = __kmp_max_nth;
2629 
2630  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2631  thread = __kmp_threads[gtid];
2632 
2633  __kmp_save_internal_controls(thread);
2634 
2635  set__nproc(thread, new_nth);
2636 
2637  // If this omp_set_num_threads() call will cause the hot team size to be
2638  // reduced (in the absence of a num_threads clause), then reduce it now,
2639  // rather than waiting for the next parallel region.
2640  root = thread->th.th_root;
2641  if (__kmp_init_parallel && (!root->r.r_active) &&
2642  (root->r.r_hot_team->t.t_nproc > new_nth)
2643 #if KMP_NESTED_HOT_TEAMS
2644  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2645 #endif
2646  ) {
2647  kmp_team_t *hot_team = root->r.r_hot_team;
2648  int f;
2649 
2650  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2651 
2652  // Release the extra threads we don't need any more.
2653  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2654  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2655  if (__kmp_tasking_mode != tskm_immediate_exec) {
2656  // When decreasing the team size, threads no longer in the team should
2657  // unreference the task team.
2658  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2659  }
2660  __kmp_free_thread(hot_team->t.t_threads[f]);
2661  hot_team->t.t_threads[f] = NULL;
2662  }
2663  hot_team->t.t_nproc = new_nth;
2664 #if KMP_NESTED_HOT_TEAMS
2665  if (thread->th.th_hot_teams) {
2666  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2667  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2668  }
2669 #endif
2670 
2671  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2672 
2673  // Update the th_team_nproc field in the threads that are still active.
2674  for (f = 0; f < new_nth; f++) {
2675  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2676  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2677  }
2678  // Special flag to mark that the size change came from an omp_set_num_threads() call
2679  hot_team->t.t_size_changed = -1;
2680  }
2681 }
2682 
2683 /* Changes max_active_levels */
2684 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2685  kmp_info_t *thread;
2686 
2687  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2688  "%d = (%d)\n",
2689  gtid, max_active_levels));
2690  KMP_DEBUG_ASSERT(__kmp_init_serial);
2691 
2692  // validate max_active_levels
2693  if (max_active_levels < 0) {
2694  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2695  // We ignore this call if the user has specified a negative value.
2696  // The current setting won't be changed. The last valid setting will be
2697  // used. A warning will be issued (if warnings are allowed as controlled by
2698  // the KMP_WARNINGS env var).
2699  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2700  "max_active_levels for thread %d = (%d)\n",
2701  gtid, max_active_levels));
2702  return;
2703  }
2704  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2705  // It's OK: max_active_levels is within the valid range
2706  // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
2707  // We allow a zero value. (implementation-defined behavior)
2708  } else {
2709  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2710  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2711  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2712  // The current upper limit is MAX_INT. (implementation-defined behavior)
2713  // If the input exceeds the upper limit, we clamp it to the upper
2714  // limit. (implementation-defined behavior)
2715  // Actually, the flow should never get here while we use the MAX_INT limit.
2716  }
2717  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2718  "max_active_levels for thread %d = (%d)\n",
2719  gtid, max_active_levels));
2720 
2721  thread = __kmp_threads[gtid];
2722 
2723  __kmp_save_internal_controls(thread);
2724 
2725  set__max_active_levels(thread, max_active_levels);
2726 }
2727 
2728 /* Gets max_active_levels */
2729 int __kmp_get_max_active_levels(int gtid) {
2730  kmp_info_t *thread;
2731 
2732  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2733  KMP_DEBUG_ASSERT(__kmp_init_serial);
2734 
2735  thread = __kmp_threads[gtid];
2736  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2737  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2738  "curtask_maxaclevel=%d\n",
2739  gtid, thread->th.th_current_task,
2740  thread->th.th_current_task->td_icvs.max_active_levels));
2741  return thread->th.th_current_task->td_icvs.max_active_levels;
2742 }
2743 
2744 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2745 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2746  kmp_info_t *thread;
2747  // kmp_team_t *team;
2748 
2749  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2750  gtid, (int)kind, chunk));
2751  KMP_DEBUG_ASSERT(__kmp_init_serial);
2752 
2753  // Check if the kind parameter is valid, correct if needed.
2754  // Valid parameters should fit in one of two intervals - standard or extended:
2755  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2756  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2757  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2758  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2759  // TODO: Hint needs attention in case we change the default schedule.
2760  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2761  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2762  __kmp_msg_null);
2763  kind = kmp_sched_default;
2764  chunk = 0; // ignore chunk value in case of bad kind
2765  }
2766 
2767  thread = __kmp_threads[gtid];
2768 
2769  __kmp_save_internal_controls(thread);
2770 
2771  if (kind < kmp_sched_upper_std) {
2772  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2773  // differentiate static chunked vs. unchunked: chunk should be invalid to
2774  // indicate an unchunked schedule (which is the default)
2775  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2776  } else {
2777  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2778  __kmp_sch_map[kind - kmp_sched_lower - 1];
2779  }
2780  } else {
2781  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2782  // kmp_sched_lower - 2 ];
2783  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2784  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2785  kmp_sched_lower - 2];
2786  }
2787  if (kind == kmp_sched_auto || chunk < 1) {
2788  // ignore parameter chunk for schedule auto
2789  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2790  } else {
2791  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2792  }
2793 }
2794 
2795 /* Gets def_sched_var ICV values */
2796 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2797  kmp_info_t *thread;
2798  enum sched_type th_type;
2799 
2800  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2801  KMP_DEBUG_ASSERT(__kmp_init_serial);
2802 
2803  thread = __kmp_threads[gtid];
2804 
2805  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2806 
2807  switch (th_type) {
2808  case kmp_sch_static:
2809  case kmp_sch_static_greedy:
2810  case kmp_sch_static_balanced:
2811  *kind = kmp_sched_static;
2812  *chunk = 0; // chunk was not set, try to show this fact via zero value
2813  return;
2814  case kmp_sch_static_chunked:
2815  *kind = kmp_sched_static;
2816  break;
2817  case kmp_sch_dynamic_chunked:
2818  *kind = kmp_sched_dynamic;
2819  break;
2820  case kmp_sch_guided_chunked:
2821  case kmp_sch_guided_iterative_chunked:
2822  case kmp_sch_guided_analytical_chunked:
2823  *kind = kmp_sched_guided;
2824  break;
2825  case kmp_sch_auto:
2826  *kind = kmp_sched_auto;
2827  break;
2828  case kmp_sch_trapezoidal:
2829  *kind = kmp_sched_trapezoidal;
2830  break;
2831 #if KMP_STATIC_STEAL_ENABLED
2832  case kmp_sch_static_steal:
2833  *kind = kmp_sched_static_steal;
2834  break;
2835 #endif
2836  default:
2837  KMP_FATAL(UnknownSchedulingType, th_type);
2838  }
2839 
2840  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2841 }
2842 
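/* Backing implementation for omp_get_ancestor_thread_num(): walk up the team
   hierarchy from the current team, collapsing serialized levels, until the
   requested nesting level is reached. */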
2843 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2844 
2845  int ii, dd;
2846  kmp_team_t *team;
2847  kmp_info_t *thr;
2848 
2849  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2850  KMP_DEBUG_ASSERT(__kmp_init_serial);
2851 
2852  // validate level
2853  if (level == 0)
2854  return 0;
2855  if (level < 0)
2856  return -1;
2857  thr = __kmp_threads[gtid];
2858  team = thr->th.th_team;
2859  ii = team->t.t_level;
2860  if (level > ii)
2861  return -1;
2862 
2863 #if OMP_40_ENABLED
2864  if (thr->th.th_teams_microtask) {
2865  // AC: we are in a teams region where multiple nested teams have the same level
2866  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2867  if (level <=
2868  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2869  KMP_DEBUG_ASSERT(ii >= tlevel);
2870  // AC: Since we need to step over the teams league, we artificially
2871  // increase ii
2872  if (ii == tlevel) {
2873  ii += 2; // three teams have same level
2874  } else {
2875  ii++; // two teams have same level
2876  }
2877  }
2878  }
2879 #endif
2880 
2881  if (ii == level)
2882  return __kmp_tid_from_gtid(gtid);
2883 
2884  dd = team->t.t_serialized;
2885  level++;
2886  while (ii > level) {
2887  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2888  }
2889  if ((team->t.t_serialized) && (!dd)) {
2890  team = team->t.t_parent;
2891  continue;
2892  }
2893  if (ii > level) {
2894  team = team->t.t_parent;
2895  dd = team->t.t_serialized;
2896  ii--;
2897  }
2898  }
2899 
2900  return (dd > 1) ? (0) : (team->t.t_master_tid);
2901 }
2902 
2903 int __kmp_get_team_size(int gtid, int level) {
2904 
2905  int ii, dd;
2906  kmp_team_t *team;
2907  kmp_info_t *thr;
2908 
2909  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2910  KMP_DEBUG_ASSERT(__kmp_init_serial);
2911 
2912  // validate level
2913  if (level == 0)
2914  return 1;
2915  if (level < 0)
2916  return -1;
2917  thr = __kmp_threads[gtid];
2918  team = thr->th.th_team;
2919  ii = team->t.t_level;
2920  if (level > ii)
2921  return -1;
2922 
2923 #if OMP_40_ENABLED
2924  if (thr->th.th_teams_microtask) {
2925  // AC: we are in a teams region where multiple nested teams have the same level
2926  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2927  if (level <=
2928  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2929  KMP_DEBUG_ASSERT(ii >= tlevel);
2930  // AC: Since we need to step over the teams league, we artificially
2931  // increase ii
2932  if (ii == tlevel) {
2933  ii += 2; // three teams have same level
2934  } else {
2935  ii++; // two teams have same level
2936  }
2937  }
2938  }
2939 #endif
2940 
2941  while (ii > level) {
2942  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2943  }
2944  if (team->t.t_serialized && (!dd)) {
2945  team = team->t.t_parent;
2946  continue;
2947  }
2948  if (ii > level) {
2949  team = team->t.t_parent;
2950  ii--;
2951  }
2952  }
2953 
2954  return team->t.t_nproc;
2955 }
2956 
2957 kmp_r_sched_t __kmp_get_schedule_global() {
2958  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2959  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2960  // independently, so the updated schedule can be obtained here.
2961 
2962  kmp_r_sched_t r_sched;
2963 
2964  // create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2965  // __kmp_guided. __kmp_sched should keep its original value, so that the user
2966  // can set KMP_SCHEDULE multiple times and thus have different run-time
2967  // schedules in different roots (even in OMP 2.5)
2968  if (__kmp_sched == kmp_sch_static) {
2969  // replace STATIC with more detailed schedule (balanced or greedy)
2970  r_sched.r_sched_type = __kmp_static;
2971  } else if (__kmp_sched == kmp_sch_guided_chunked) {
2972  // replace GUIDED with more detailed schedule (iterative or analytical)
2973  r_sched.r_sched_type = __kmp_guided;
2974  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2975  r_sched.r_sched_type = __kmp_sched;
2976  }
2977 
2978  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2979  // __kmp_chunk may be wrong here (if it was never set)
2980  r_sched.chunk = KMP_DEFAULT_CHUNK;
2981  } else {
2982  r_sched.chunk = __kmp_chunk;
2983  }
2984 
2985  return r_sched;
2986 }
2987 
2988  /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2989  at least argc *t_argv entries for the requested team. */
2990 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2991 
2992  KMP_DEBUG_ASSERT(team);
2993  if (!realloc || argc > team->t.t_max_argc) {
2994 
2995  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2996  "current entries=%d\n",
2997  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2998  /* if heap space was previously allocated for the args, free it */
2999  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3000  __kmp_free((void *)team->t.t_argv);
3001 
3002  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3003  /* use unused space in the cache line for arguments */
3004  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3005  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3006  "argv entries\n",
3007  team->t.t_id, team->t.t_max_argc));
3008  team->t.t_argv = &team->t.t_inline_argv[0];
3009  if (__kmp_storage_map) {
3010  __kmp_print_storage_map_gtid(
3011  -1, &team->t.t_inline_argv[0],
3012  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3013  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3014  team->t.t_id);
3015  }
3016  } else {
3017  /* allocate space for arguments in the heap */
3018  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3019  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3020  : 2 * argc;
3021  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3022  "argv entries\n",
3023  team->t.t_id, team->t.t_max_argc));
3024  team->t.t_argv =
3025  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3026  if (__kmp_storage_map) {
3027  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3028  &team->t.t_argv[team->t.t_max_argc],
3029  sizeof(void *) * team->t.t_max_argc,
3030  "team_%d.t_argv", team->t.t_id);
3031  }
3032  }
3033  }
3034 }
3035 
3036 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3037  int i;
3038  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3039  team->t.t_threads =
3040  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3041  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3042  sizeof(dispatch_shared_info_t) * num_disp_buff);
3043  team->t.t_dispatch =
3044  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3045  team->t.t_implicit_task_taskdata =
3046  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3047  team->t.t_max_nproc = max_nth;
3048 
3049  /* setup dispatch buffers */
3050  for (i = 0; i < num_disp_buff; ++i) {
3051  team->t.t_disp_buffer[i].buffer_index = i;
3052 #if OMP_45_ENABLED
3053  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3054 #endif
3055  }
3056 }
3057 
3058 static void __kmp_free_team_arrays(kmp_team_t *team) {
3059  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3060  int i;
3061  for (i = 0; i < team->t.t_max_nproc; ++i) {
3062  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3063  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3064  team->t.t_dispatch[i].th_disp_buffer = NULL;
3065  }
3066  }
3067 #if KMP_USE_HIER_SCHED
3068  __kmp_dispatch_free_hierarchies(team);
3069 #endif
3070  __kmp_free(team->t.t_threads);
3071  __kmp_free(team->t.t_disp_buffer);
3072  __kmp_free(team->t.t_dispatch);
3073  __kmp_free(team->t.t_implicit_task_taskdata);
3074  team->t.t_threads = NULL;
3075  team->t.t_disp_buffer = NULL;
3076  team->t.t_dispatch = NULL;
3077  team->t.t_implicit_task_taskdata = 0;
3078 }
3079 
3080 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3081  kmp_info_t **oldThreads = team->t.t_threads;
3082 
3083  __kmp_free(team->t.t_disp_buffer);
3084  __kmp_free(team->t.t_dispatch);
3085  __kmp_free(team->t.t_implicit_task_taskdata);
3086  __kmp_allocate_team_arrays(team, max_nth);
3087 
3088  KMP_MEMCPY(team->t.t_threads, oldThreads,
3089  team->t.t_nproc * sizeof(kmp_info_t *));
3090 
3091  __kmp_free(oldThreads);
3092 }
3093 
3094 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3095 
3096  kmp_r_sched_t r_sched =
3097  __kmp_get_schedule_global(); // get current state of scheduling globals
3098 
3099 #if OMP_40_ENABLED
3100  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3101 #endif /* OMP_40_ENABLED */
3102 
3103  kmp_internal_control_t g_icvs = {
3104  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3105  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3106  // for nested parallelism (per thread)
3107  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3108  // adjustment of threads (per thread)
3109  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3110  // whether blocktime is explicitly set
3111  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3112 #if KMP_USE_MONITOR
3113  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3114 // intervals
3115 #endif
3116  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3117  // next parallel region (per thread)
3118  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3119  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3120  // for max_active_levels
3121  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3122 // {sched,chunk} pair
3123 #if OMP_40_ENABLED
3124  __kmp_nested_proc_bind.bind_types[0],
3125  __kmp_default_device,
3126 #endif /* OMP_40_ENABLED */
3127  NULL // struct kmp_internal_control *next;
3128  };
3129 
3130  return g_icvs;
3131 }
3132 
3133 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3134 
3135  kmp_internal_control_t gx_icvs;
3136  gx_icvs.serial_nesting_level =
3137  0; // probably =team->t.t_serial like in save_inter_controls
3138  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3139  gx_icvs.next = NULL;
3140 
3141  return gx_icvs;
3142 }
3143 
3144 static void __kmp_initialize_root(kmp_root_t *root) {
3145  int f;
3146  kmp_team_t *root_team;
3147  kmp_team_t *hot_team;
3148  int hot_team_max_nth;
3149  kmp_r_sched_t r_sched =
3150  __kmp_get_schedule_global(); // get current state of scheduling globals
3151  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3152  KMP_DEBUG_ASSERT(root);
3153  KMP_ASSERT(!root->r.r_begin);
3154 
3155  /* setup the root state structure */
3156  __kmp_init_lock(&root->r.r_begin_lock);
3157  root->r.r_begin = FALSE;
3158  root->r.r_active = FALSE;
3159  root->r.r_in_parallel = 0;
3160  root->r.r_blocktime = __kmp_dflt_blocktime;
3161  root->r.r_nested = __kmp_dflt_nested;
3162  root->r.r_cg_nthreads = 1;
3163 
3164  /* setup the root team for this task */
3165  /* allocate the root team structure */
3166  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3167 
3168  root_team =
3169  __kmp_allocate_team(root,
3170  1, // new_nproc
3171  1, // max_nproc
3172 #if OMPT_SUPPORT
3173  ompt_data_none, // root parallel id
3174 #endif
3175 #if OMP_40_ENABLED
3176  __kmp_nested_proc_bind.bind_types[0],
3177 #endif
3178  &r_icvs,
3179  0 // argc
3180  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3181  );
3182 #if USE_DEBUGGER
3183  // Non-NULL value should be assigned to make the debugger display the root
3184  // team.
3185  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3186 #endif
3187 
3188  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3189 
3190  root->r.r_root_team = root_team;
3191  root_team->t.t_control_stack_top = NULL;
3192 
3193  /* initialize root team */
3194  root_team->t.t_threads[0] = NULL;
3195  root_team->t.t_nproc = 1;
3196  root_team->t.t_serialized = 1;
3197  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3198  root_team->t.t_sched.sched = r_sched.sched;
3199  KA_TRACE(
3200  20,
3201  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3202  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3203 
3204  /* setup the hot team for this task */
3205  /* allocate the hot team structure */
3206  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3207 
3208  hot_team =
3209  __kmp_allocate_team(root,
3210  1, // new_nproc
3211  __kmp_dflt_team_nth_ub * 2, // max_nproc
3212 #if OMPT_SUPPORT
3213  ompt_data_none, // root parallel id
3214 #endif
3215 #if OMP_40_ENABLED
3216  __kmp_nested_proc_bind.bind_types[0],
3217 #endif
3218  &r_icvs,
3219  0 // argc
3220  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3221  );
3222  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3223 
3224  root->r.r_hot_team = hot_team;
3225  root_team->t.t_control_stack_top = NULL;
3226 
3227  /* first-time initialization */
3228  hot_team->t.t_parent = root_team;
3229 
3230  /* initialize hot team */
3231  hot_team_max_nth = hot_team->t.t_max_nproc;
3232  for (f = 0; f < hot_team_max_nth; ++f) {
3233  hot_team->t.t_threads[f] = NULL;
3234  }
3235  hot_team->t.t_nproc = 1;
3236  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3237  hot_team->t.t_sched.sched = r_sched.sched;
3238  hot_team->t.t_size_changed = 0;
3239 }
3240 
3241 #ifdef KMP_DEBUG
3242 
3243 typedef struct kmp_team_list_item {
3244  kmp_team_p const *entry;
3245  struct kmp_team_list_item *next;
3246 } kmp_team_list_item_t;
3247 typedef kmp_team_list_item_t *kmp_team_list_t;
3248 
3249 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3250  kmp_team_list_t list, // List of teams.
3251  kmp_team_p const *team // Team to add.
3252  ) {
3253 
3254  // List must terminate with item where both entry and next are NULL.
3255  // Team is added to the list only once.
3256  // List is sorted in ascending order by team id.
3257  // Team id is *not* a key.
3258 
3259  kmp_team_list_t l;
3260 
3261  KMP_DEBUG_ASSERT(list != NULL);
3262  if (team == NULL) {
3263  return;
3264  }
3265 
3266  __kmp_print_structure_team_accum(list, team->t.t_parent);
3267  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3268 
3269  // Search list for the team.
3270  l = list;
3271  while (l->next != NULL && l->entry != team) {
3272  l = l->next;
3273  }
3274  if (l->next != NULL) {
3275  return; // Team has been added before, exit.
3276  }
3277 
3278  // Team is not found. Search list again for insertion point.
3279  l = list;
3280  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3281  l = l->next;
3282  }
3283 
3284  // Insert team.
3285  {
3286  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3287  sizeof(kmp_team_list_item_t));
3288  *item = *l;
3289  l->entry = team;
3290  l->next = item;
3291  }
3292 }
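// Note: the insertion above relies on the NULL/NULL terminator item and the
// "copy the current node" trick, so no special case is needed to insert in
// front of the node that l points at. Schematically:
//   *item    = *l;    // old entry/next move into the freshly malloc'ed node
//   l->entry = team;  // l itself becomes the inserted element
//   l->next  = item;  // list stays sorted and still ends in a NULL/NULL item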
3293 
3294 static void __kmp_print_structure_team(char const *title,
3295  kmp_team_p const *team) {
3296 
3297  __kmp_printf("%s", title);
3298  if (team != NULL) {
3299  __kmp_printf("%2x %p\n", team->t.t_id, team);
3300  } else {
3301  __kmp_printf(" - (nil)\n");
3302  }
3303 }
3304 
3305 static void __kmp_print_structure_thread(char const *title,
3306  kmp_info_p const *thread) {
3307  __kmp_printf("%s", title);
3308  if (thread != NULL) {
3309  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3310  } else {
3311  __kmp_printf(" - (nil)\n");
3312  }
3313 }
3314 
3315 void __kmp_print_structure(void) {
3316 
3317  kmp_team_list_t list;
3318 
3319  // Initialize list of teams.
3320  list =
3321  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3322  list->entry = NULL;
3323  list->next = NULL;
3324 
3325  __kmp_printf("\n------------------------------\nGlobal Thread "
3326  "Table\n------------------------------\n");
3327  {
3328  int gtid;
3329  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3330  __kmp_printf("%2d", gtid);
3331  if (__kmp_threads != NULL) {
3332  __kmp_printf(" %p", __kmp_threads[gtid]);
3333  }
3334  if (__kmp_root != NULL) {
3335  __kmp_printf(" %p", __kmp_root[gtid]);
3336  }
3337  __kmp_printf("\n");
3338  }
3339  }
3340 
3341  // Print out __kmp_threads array.
3342  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3343  "----------\n");
3344  if (__kmp_threads != NULL) {
3345  int gtid;
3346  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3347  kmp_info_t const *thread = __kmp_threads[gtid];
3348  if (thread != NULL) {
3349  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3350  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3351  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3352  __kmp_print_structure_team(" Serial Team: ",
3353  thread->th.th_serial_team);
3354  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3355  __kmp_print_structure_thread(" Master: ",
3356  thread->th.th_team_master);
3357  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3358  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3359 #if OMP_40_ENABLED
3360  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361 #endif
3362  __kmp_print_structure_thread(" Next in pool: ",
3363  thread->th.th_next_pool);
3364  __kmp_printf("\n");
3365  __kmp_print_structure_team_accum(list, thread->th.th_team);
3366  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3367  }
3368  }
3369  } else {
3370  __kmp_printf("Threads array is not allocated.\n");
3371  }
3372 
3373  // Print out __kmp_root array.
3374  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3375  "--------\n");
3376  if (__kmp_root != NULL) {
3377  int gtid;
3378  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3379  kmp_root_t const *root = __kmp_root[gtid];
3380  if (root != NULL) {
3381  __kmp_printf("GTID %2d %p:\n", gtid, root);
3382  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3383  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3384  __kmp_print_structure_thread(" Uber Thread: ",
3385  root->r.r_uber_thread);
3386  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3387  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3388  __kmp_printf(" In Parallel: %2d\n",
3389  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3390  __kmp_printf("\n");
3391  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3392  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3393  }
3394  }
3395  } else {
3396  __kmp_printf("Ubers array is not allocated.\n");
3397  }
3398 
3399  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3400  "--------\n");
3401  while (list->next != NULL) {
3402  kmp_team_p const *team = list->entry;
3403  int i;
3404  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3405  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3406  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3407  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3408  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3409  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3410  for (i = 0; i < team->t.t_nproc; ++i) {
3411  __kmp_printf(" Thread %2d: ", i);
3412  __kmp_print_structure_thread("", team->t.t_threads[i]);
3413  }
3414  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3415  __kmp_printf("\n");
3416  list = list->next;
3417  }
3418 
3419  // Print out __kmp_thread_pool and __kmp_team_pool.
3420  __kmp_printf("\n------------------------------\nPools\n----------------------"
3421  "--------\n");
3422  __kmp_print_structure_thread("Thread pool: ",
3423  CCAST(kmp_info_t *, __kmp_thread_pool));
3424  __kmp_print_structure_team("Team pool: ",
3425  CCAST(kmp_team_t *, __kmp_team_pool));
3426  __kmp_printf("\n");
3427 
3428  // Free team list.
3429  while (list != NULL) {
3430  kmp_team_list_item_t *item = list;
3431  list = list->next;
3432  KMP_INTERNAL_FREE(item);
3433  }
3434 }
3435 
3436 #endif
3437 
3438 //---------------------------------------------------------------------------
3439 // Stuff for per-thread fast random number generator
3440 // Table of primes
3441 static const unsigned __kmp_primes[] = {
3442  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3443  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3444  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3445  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3446  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3447  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3448  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3449  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3450  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3451  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3452  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3453 
3454 //---------------------------------------------------------------------------
3455 // __kmp_get_random: Get a random number using a linear congruential method.
3456 unsigned short __kmp_get_random(kmp_info_t *thread) {
3457  unsigned x = thread->th.th_x;
3458  unsigned short r = x >> 16;
3459 
3460  thread->th.th_x = x * thread->th.th_a + 1;
3461 
3462  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3463  thread->th.th_info.ds.ds_tid, r));
3464 
3465  return r;
3466 }
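// Note: this is the usual 32-bit linear congruential recurrence
// x_{n+1} = a * x_n + 1 (mod 2^32), with a per-thread multiplier a drawn from
// __kmp_primes; only the high 16 bits of the previous state are returned,
// since the low-order bits of such a generator are the least random.
// Illustrative trace with hypothetical values:
//   th_a = 0x9e3779b1, th_x = 0x12345678
//   -> returns 0x1234, th_x becomes 0x12345678 * 0x9e3779b1 + 1 (mod 2^32)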
3467 //--------------------------------------------------------
3468 // __kmp_init_random: Initialize a random number generator
3469 void __kmp_init_random(kmp_info_t *thread) {
3470  unsigned seed = thread->th.th_info.ds.ds_tid;
3471 
3472  thread->th.th_a =
3473  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3474  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3475  KA_TRACE(30,
3476  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3477 }
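// Note: seeding uses the thread's tid to select one of the 64 primes above as
// the multiplier and to derive the initial state, so each thread gets a
// distinct, deterministic stream. For example (illustrative), tid = 5 selects
// __kmp_primes[5] = 0xb495a877 and starts from x0 = 6 * 0xb495a877 + 1.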
3478 
3479 #if KMP_OS_WINDOWS
3480 /* reclaim array entries for root threads that are already dead, returns number
3481  * reclaimed */
3482 static int __kmp_reclaim_dead_roots(void) {
3483  int i, r = 0;
3484 
3485  for (i = 0; i < __kmp_threads_capacity; ++i) {
3486  if (KMP_UBER_GTID(i) &&
3487  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3488  !__kmp_root[i]
3489  ->r.r_active) { // AC: reclaim only roots died in non-active state
3490  r += __kmp_unregister_root_other_thread(i);
3491  }
3492  }
3493  return r;
3494 }
3495 #endif
3496 
3497 /* This function attempts to create free entries in __kmp_threads and
3498  __kmp_root, and returns the number of free entries generated.
3499 
3500  For Windows* OS static library, the first mechanism used is to reclaim array
3501  entries for root threads that are already dead.
3502 
3503  On all platforms, expansion is attempted on the arrays __kmp_threads and
3504  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3505  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3506  threadprivate cache array has been created. Synchronization with
3507  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3508 
3509  After any dead root reclamation, if the clipping value allows array expansion
3510  to result in the generation of a total of nNeed free slots, the function does
3511  that expansion. If not, nothing is done beyond the possible initial root
3512  thread reclamation.
3513 
3514  If any argument is negative, the behavior is undefined. */
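// Note: a worked example of the doubling-with-clipping policy below, using
// hypothetical numbers: with __kmp_threads_capacity = 4,
// __kmp_sys_max_nth = 32 and nNeed = 5, the minimum required capacity is 9,
// the loop doubles 4 -> 8 -> 16, so 16 slots are allocated and the function
// reports 12 new entries. If even __kmp_sys_max_nth cannot provide nNeed
// additional slots, the function gives up without expanding.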
3515 static int __kmp_expand_threads(int nNeed) {
3516  int added = 0;
3517  int minimumRequiredCapacity;
3518  int newCapacity;
3519  kmp_info_t **newThreads;
3520  kmp_root_t **newRoot;
3521 
3522 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3523 // resizing __kmp_threads does not need additional protection if foreign
3524 // threads are present
3525 
3526 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3527  /* only for Windows static library */
3528  /* reclaim array entries for root threads that are already dead */
3529  added = __kmp_reclaim_dead_roots();
3530 
3531  if (nNeed) {
3532  nNeed -= added;
3533  if (nNeed < 0)
3534  nNeed = 0;
3535  }
3536 #endif
3537  if (nNeed <= 0)
3538  return added;
3539 
3540  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3541  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3542  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3543  // > __kmp_max_nth in one of two ways:
3544  //
3545  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3546  // may not be reused by another thread, so we may need to increase
3547  // __kmp_threads_capacity to __kmp_max_nth + 1.
3548  //
3549  // 2) New foreign root(s) are encountered. We always register new foreign
3550  // roots. This may cause a smaller # of threads to be allocated at
3551  // subsequent parallel regions, but the worker threads hang around (and
3552  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3553  //
3554  // Anyway, that is the reason for moving the check to see if
3555  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3556  // instead of having it performed here. -BB
3557 
3558  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3559 
3560  /* compute expansion headroom to check if we can expand */
3561  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3562  /* possible expansion too small -- give up */
3563  return added;
3564  }
3565  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3566 
3567  newCapacity = __kmp_threads_capacity;
3568  do {
3569  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3570  : __kmp_sys_max_nth;
3571  } while (newCapacity < minimumRequiredCapacity);
3572  newThreads = (kmp_info_t **)__kmp_allocate(
3573  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3574  newRoot =
3575  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3576  KMP_MEMCPY(newThreads, __kmp_threads,
3577  __kmp_threads_capacity * sizeof(kmp_info_t *));
3578  KMP_MEMCPY(newRoot, __kmp_root,
3579  __kmp_threads_capacity * sizeof(kmp_root_t *));
3580 
3581  kmp_info_t **temp_threads = __kmp_threads;
3582  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3583  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3584  __kmp_free(temp_threads);
3585  added += newCapacity - __kmp_threads_capacity;
3586  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3587 
3588  if (newCapacity > __kmp_tp_capacity) {
3589  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3590  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3591  __kmp_threadprivate_resize_cache(newCapacity);
3592  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3593  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3594  }
3595  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3596  }
3597 
3598  return added;
3599 }
3600 
3601 /* Register the current thread as a root thread and obtain our gtid. We must
3602  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3603  the thread that calls from __kmp_do_serial_initialize() */
3604 int __kmp_register_root(int initial_thread) {
3605  kmp_info_t *root_thread;
3606  kmp_root_t *root;
3607  int gtid;
3608  int capacity;
3609  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3610  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3611  KMP_MB();
3612 
3613  /* 2007-03-02:
3614  If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3615  initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3616  work as expected -- it may return false (that means there is at least one
3617  empty slot in __kmp_threads array), but it is possible the only free slot
3618  is #0, which is reserved for the initial thread and so cannot be used for
3619  this one. The following code works around this bug.
3620 
3621  However, the right solution seems to be not to reserve slot #0 for the
3622  initial thread, because:
3623  (1) there is no magic in slot #0,
3624  (2) we cannot detect the initial thread reliably (the first thread that
3625  does serial initialization may not be the real initial thread).
3626  */
3627  capacity = __kmp_threads_capacity;
3628  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3629  --capacity;
3630  }
3631 
3632  /* see if there are too many threads */
3633  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3634  if (__kmp_tp_cached) {
3635  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3636  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3637  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3638  } else {
3639  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3640  __kmp_msg_null);
3641  }
3642  }
3643 
3644  /* find an available thread slot */
3645  /* Don't reassign the zero slot since we need that to only be used by initial
3646  thread */
3647  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3648  gtid++)
3649  ;
3650  KA_TRACE(1,
3651  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3652  KMP_ASSERT(gtid < __kmp_threads_capacity);
3653 
3654  /* update global accounting */
3655  __kmp_all_nth++;
3656  TCW_4(__kmp_nth, __kmp_nth + 1);
3657 
3658  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3659  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3660  if (__kmp_adjust_gtid_mode) {
3661  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3662  if (TCR_4(__kmp_gtid_mode) != 2) {
3663  TCW_4(__kmp_gtid_mode, 2);
3664  }
3665  } else {
3666  if (TCR_4(__kmp_gtid_mode) != 1) {
3667  TCW_4(__kmp_gtid_mode, 1);
3668  }
3669  }
3670  }
3671 
3672 #ifdef KMP_ADJUST_BLOCKTIME
3673  /* Adjust blocktime to zero if necessary */
3674  /* Middle initialization might not have occurred yet */
3675  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3676  if (__kmp_nth > __kmp_avail_proc) {
3677  __kmp_zero_bt = TRUE;
3678  }
3679  }
3680 #endif /* KMP_ADJUST_BLOCKTIME */
3681 
3682  /* setup this new hierarchy */
3683  if (!(root = __kmp_root[gtid])) {
3684  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3685  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3686  }
3687 
3688 #if KMP_STATS_ENABLED
3689  // Initialize stats as soon as possible (right after gtid assignment).
3690  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3691  __kmp_stats_thread_ptr->startLife();
3692  KMP_SET_THREAD_STATE(SERIAL_REGION);
3693  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3694 #endif
3695  __kmp_initialize_root(root);
3696 
3697  /* setup new root thread structure */
3698  if (root->r.r_uber_thread) {
3699  root_thread = root->r.r_uber_thread;
3700  } else {
3701  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3702  if (__kmp_storage_map) {
3703  __kmp_print_thread_storage_map(root_thread, gtid);
3704  }
3705  root_thread->th.th_info.ds.ds_gtid = gtid;
3706 #if OMPT_SUPPORT
3707  root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3708 #endif
3709  root_thread->th.th_root = root;
3710  if (__kmp_env_consistency_check) {
3711  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3712  }
3713 #if USE_FAST_MEMORY
3714  __kmp_initialize_fast_memory(root_thread);
3715 #endif /* USE_FAST_MEMORY */
3716 
3717 #if KMP_USE_BGET
3718  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3719  __kmp_initialize_bget(root_thread);
3720 #endif
3721  __kmp_init_random(root_thread); // Initialize random number generator
3722  }
3723 
3724  /* setup the serial team held in reserve by the root thread */
3725  if (!root_thread->th.th_serial_team) {
3726  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3727  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3728  root_thread->th.th_serial_team =
3729  __kmp_allocate_team(root, 1, 1,
3730 #if OMPT_SUPPORT
3731  ompt_data_none, // root parallel id
3732 #endif
3733 #if OMP_40_ENABLED
3734  proc_bind_default,
3735 #endif
3736  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3737  }
3738  KMP_ASSERT(root_thread->th.th_serial_team);
3739  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3740  root_thread->th.th_serial_team));
3741 
3742  /* drop root_thread into place */
3743  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3744 
3745  root->r.r_root_team->t.t_threads[0] = root_thread;
3746  root->r.r_hot_team->t.t_threads[0] = root_thread;
3747  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3748  // AC: the team created in reserve, not for execution (it is unused for now).
3749  root_thread->th.th_serial_team->t.t_serialized = 0;
3750  root->r.r_uber_thread = root_thread;
3751 
3752  /* initialize the thread, get it ready to go */
3753  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3754  TCW_4(__kmp_init_gtid, TRUE);
3755 
3756  /* prepare the master thread for get_gtid() */
3757  __kmp_gtid_set_specific(gtid);
3758 
3759 #if USE_ITT_BUILD
3760  __kmp_itt_thread_name(gtid);
3761 #endif /* USE_ITT_BUILD */
3762 
3763 #ifdef KMP_TDATA_GTID
3764  __kmp_gtid = gtid;
3765 #endif
3766  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3767  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3768 
3769  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3770  "plain=%u\n",
3771  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3772  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3773  KMP_INIT_BARRIER_STATE));
3774  { // Initialize barrier data.
3775  int b;
3776  for (b = 0; b < bs_last_barrier; ++b) {
3777  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3778 #if USE_DEBUGGER
3779  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3780 #endif
3781  }
3782  }
3783  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3784  KMP_INIT_BARRIER_STATE);
3785 
3786 #if KMP_AFFINITY_SUPPORTED
3787 #if OMP_40_ENABLED
3788  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3789  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3790  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3791  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3792 #endif
3793 
3794  if (TCR_4(__kmp_init_middle)) {
3795  __kmp_affinity_set_init_mask(gtid, TRUE);
3796  }
3797 #endif /* KMP_AFFINITY_SUPPORTED */
3798 
3799  __kmp_root_counter++;
3800 
3801 #if OMPT_SUPPORT
3802  if (!initial_thread && ompt_enabled.enabled) {
3803 
3804  ompt_thread_t *root_thread = ompt_get_thread();
3805 
3806  ompt_set_thread_state(root_thread, omp_state_overhead);
3807 
3808  if (ompt_enabled.ompt_callback_thread_begin) {
3809  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3810  ompt_thread_initial, __ompt_get_thread_data_internal());
3811  }
3812  ompt_data_t *task_data;
3813  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3814  if (ompt_enabled.ompt_callback_task_create) {
3815  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3816  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3817  // initial task has nothing to return to
3818  }
3819 
3820  ompt_set_thread_state(root_thread, omp_state_work_serial);
3821  }
3822 #endif
3823 
3824  KMP_MB();
3825  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3826 
3827  return gtid;
3828 }
3829 
3830 #if KMP_NESTED_HOT_TEAMS
3831 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3832  const int max_level) {
3833  int i, n, nth;
3834  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3835  if (!hot_teams || !hot_teams[level].hot_team) {
3836  return 0;
3837  }
3838  KMP_DEBUG_ASSERT(level < max_level);
3839  kmp_team_t *team = hot_teams[level].hot_team;
3840  nth = hot_teams[level].hot_team_nth;
3841  n = nth - 1; // master is not freed
3842  if (level < max_level - 1) {
3843  for (i = 0; i < nth; ++i) {
3844  kmp_info_t *th = team->t.t_threads[i];
3845  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3846  if (i > 0 && th->th.th_hot_teams) {
3847  __kmp_free(th->th.th_hot_teams);
3848  th->th.th_hot_teams = NULL;
3849  }
3850  }
3851  }
3852  __kmp_free_team(root, team, NULL);
3853  return n;
3854 }
3855 #endif
3856 
3857 // Resets a root thread and clears its root and hot teams.
3858 // Returns the number of __kmp_threads entries directly and indirectly freed.
3859 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3860  kmp_team_t *root_team = root->r.r_root_team;
3861  kmp_team_t *hot_team = root->r.r_hot_team;
3862  int n = hot_team->t.t_nproc;
3863  int i;
3864 
3865  KMP_DEBUG_ASSERT(!root->r.r_active);
3866 
3867  root->r.r_root_team = NULL;
3868  root->r.r_hot_team = NULL;
3869  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3870  // before the call to __kmp_free_team().
3871  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3872 #if KMP_NESTED_HOT_TEAMS
3873  if (__kmp_hot_teams_max_level >
3874  0) { // need to free nested hot teams and their threads if any
3875  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3876  kmp_info_t *th = hot_team->t.t_threads[i];
3877  if (__kmp_hot_teams_max_level > 1) {
3878  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3879  }
3880  if (th->th.th_hot_teams) {
3881  __kmp_free(th->th.th_hot_teams);
3882  th->th.th_hot_teams = NULL;
3883  }
3884  }
3885  }
3886 #endif
3887  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3888 
3889  // Before we can reap the thread, we need to make certain that all other
3890  // threads in the teams that had this root as ancestor have stopped trying to
3891  // steal tasks.
3892  if (__kmp_tasking_mode != tskm_immediate_exec) {
3893  __kmp_wait_to_unref_task_teams();
3894  }
3895 
3896 #if KMP_OS_WINDOWS
3897  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3898  KA_TRACE(
3899  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3900  "\n",
3901  (LPVOID) & (root->r.r_uber_thread->th),
3902  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3903  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3904 #endif /* KMP_OS_WINDOWS */
3905 
3906 #if OMPT_SUPPORT
3907  if (ompt_enabled.ompt_callback_thread_end) {
3908  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3909  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3910  }
3911 #endif
3912 
3913  TCW_4(__kmp_nth,
3914  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3915  root->r.r_cg_nthreads--;
3916 
3917  __kmp_reap_thread(root->r.r_uber_thread, 1);
3918 
3919  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3920  // it instead of freeing it.
3921  root->r.r_uber_thread = NULL;
3922  /* mark root as no longer in use */
3923  root->r.r_begin = FALSE;
3924 
3925  return n;
3926 }
3927 
3928 void __kmp_unregister_root_current_thread(int gtid) {
3929  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3930  /* this lock should be ok, since unregister_root_current_thread is never
3931  called during an abort, only during a normal close. furthermore, if you
3932  have the forkjoin lock, you should never try to get the initz lock */
3933  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3934  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3935  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3936  "exiting T#%d\n",
3937  gtid));
3938  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3939  return;
3940  }
3941  kmp_root_t *root = __kmp_root[gtid];
3942 
3943  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3944  KMP_ASSERT(KMP_UBER_GTID(gtid));
3945  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3946  KMP_ASSERT(root->r.r_active == FALSE);
3947 
3948  KMP_MB();
3949 
3950 #if OMP_45_ENABLED
3951  kmp_info_t *thread = __kmp_threads[gtid];
3952  kmp_team_t *team = thread->th.th_team;
3953  kmp_task_team_t *task_team = thread->th.th_task_team;
3954 
3955  // we need to wait for the proxy tasks before finishing the thread
3956  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3957 #if OMPT_SUPPORT
3958  // the runtime is shutting down so we won't report any events
3959  thread->th.ompt_thread_info.state = omp_state_undefined;
3960 #endif
3961  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3962  }
3963 #endif
3964 
3965  __kmp_reset_root(gtid, root);
3966 
3967  /* free up this thread slot */
3968  __kmp_gtid_set_specific(KMP_GTID_DNE);
3969 #ifdef KMP_TDATA_GTID
3970  __kmp_gtid = KMP_GTID_DNE;
3971 #endif
3972 
3973  KMP_MB();
3974  KC_TRACE(10,
3975  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3976 
3977  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3978 }
3979 
3980 #if KMP_OS_WINDOWS
3981 /* __kmp_forkjoin_lock must be already held
3982  Unregisters a root thread that is not the current thread. Returns the number
3983  of __kmp_threads entries freed as a result. */
3984 static int __kmp_unregister_root_other_thread(int gtid) {
3985  kmp_root_t *root = __kmp_root[gtid];
3986  int r;
3987 
3988  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3989  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3990  KMP_ASSERT(KMP_UBER_GTID(gtid));
3991  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3992  KMP_ASSERT(root->r.r_active == FALSE);
3993 
3994  r = __kmp_reset_root(gtid, root);
3995  KC_TRACE(10,
3996  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3997  return r;
3998 }
3999 #endif
4000 
4001 #if KMP_DEBUG
4002 void __kmp_task_info() {
4003 
4004  kmp_int32 gtid = __kmp_entry_gtid();
4005  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4006  kmp_info_t *this_thr = __kmp_threads[gtid];
4007  kmp_team_t *steam = this_thr->th.th_serial_team;
4008  kmp_team_t *team = this_thr->th.th_team;
4009 
4010  __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4011  "ptask=%p\n",
4012  gtid, tid, this_thr, team, this_thr->th.th_current_task,
4013  team->t.t_implicit_task_taskdata[tid].td_parent);
4014 }
4015 #endif // KMP_DEBUG
4016 
4017 /* TODO optimize with one big memclr, take out what isn't needed, split
4018  responsibility to workers as much as possible, and delay initialization of
4019  features as much as possible */
4020 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4021  int tid, int gtid) {
4022  /* this_thr->th.th_info.ds.ds_gtid is setup in
4023  kmp_allocate_thread/create_worker.
4024  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4025  kmp_info_t *master = team->t.t_threads[0];
4026  KMP_DEBUG_ASSERT(this_thr != NULL);
4027  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4028  KMP_DEBUG_ASSERT(team);
4029  KMP_DEBUG_ASSERT(team->t.t_threads);
4030  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4031  KMP_DEBUG_ASSERT(master);
4032  KMP_DEBUG_ASSERT(master->th.th_root);
4033 
4034  KMP_MB();
4035 
4036  TCW_SYNC_PTR(this_thr->th.th_team, team);
4037 
4038  this_thr->th.th_info.ds.ds_tid = tid;
4039  this_thr->th.th_set_nproc = 0;
4040  if (__kmp_tasking_mode != tskm_immediate_exec)
4041  // When tasking is possible, threads are not safe to reap until they are
4042  // done tasking; this will be set when tasking code is exited in wait
4043  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4044  else // no tasking --> always safe to reap
4045  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4046 #if OMP_40_ENABLED
4047  this_thr->th.th_set_proc_bind = proc_bind_default;
4048 #if KMP_AFFINITY_SUPPORTED
4049  this_thr->th.th_new_place = this_thr->th.th_current_place;
4050 #endif
4051 #endif
4052  this_thr->th.th_root = master->th.th_root;
4053 
4054  /* setup the thread's cache of the team structure */
4055  this_thr->th.th_team_nproc = team->t.t_nproc;
4056  this_thr->th.th_team_master = master;
4057  this_thr->th.th_team_serialized = team->t.t_serialized;
4058  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4059 
4060  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4061 
4062  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4063  tid, gtid, this_thr, this_thr->th.th_current_task));
4064 
4065  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4066  team, tid, TRUE);
4067 
4068  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4069  tid, gtid, this_thr, this_thr->th.th_current_task));
4070  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4071  // __kmp_initialize_team()?
4072 
4073  /* TODO no worksharing in speculative threads */
4074  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4075 
4076  this_thr->th.th_local.this_construct = 0;
4077 
4078  if (!this_thr->th.th_pri_common) {
4079  this_thr->th.th_pri_common =
4080  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4081  if (__kmp_storage_map) {
4082  __kmp_print_storage_map_gtid(
4083  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4084  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4085  }
4086  this_thr->th.th_pri_head = NULL;
4087  }
4088 
4089  /* Initialize dynamic dispatch */
4090  {
4091  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4092  // Use team max_nproc since this will never change for the team.
4093  size_t disp_size =
4094  sizeof(dispatch_private_info_t) *
4095  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4096  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4097  team->t.t_max_nproc));
4098  KMP_ASSERT(dispatch);
4099  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4100  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4101 
4102  dispatch->th_disp_index = 0;
4103 #if OMP_45_ENABLED
4104  dispatch->th_doacross_buf_idx = 0;
4105 #endif
4106  if (!dispatch->th_disp_buffer) {
4107  dispatch->th_disp_buffer =
4108  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4109 
4110  if (__kmp_storage_map) {
4111  __kmp_print_storage_map_gtid(
4112  gtid, &dispatch->th_disp_buffer[0],
4113  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4114  ? 1
4115  : __kmp_dispatch_num_buffers],
4116  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4117  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4118  gtid, team->t.t_id, gtid);
4119  }
4120  } else {
4121  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4122  }
4123 
4124  dispatch->th_dispatch_pr_current = 0;
4125  dispatch->th_dispatch_sh_current = 0;
4126 
4127  dispatch->th_deo_fcn = 0; /* ORDERED */
4128  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4129  }
4130 
4131  this_thr->th.th_next_pool = NULL;
4132 
4133  if (!this_thr->th.th_task_state_memo_stack) {
4134  size_t i;
4135  this_thr->th.th_task_state_memo_stack =
4136  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4137  this_thr->th.th_task_state_top = 0;
4138  this_thr->th.th_task_state_stack_sz = 4;
4139  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4140  ++i) // zero init the stack
4141  this_thr->th.th_task_state_memo_stack[i] = 0;
4142  }
4143 
4144  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4145  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4146 
4147  KMP_MB();
4148 }
4149 
4150 /* allocate a new thread for the requesting team. this is only called from
4151  within a forkjoin critical section. we will first try to get an available
4152  thread from the thread pool. if none is available, we will fork a new one,
4153  assuming we are able to create one. this should be assured, as the
4154  caller should have checked this first. */
4155 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4156  int new_tid) {
4157  kmp_team_t *serial_team;
4158  kmp_info_t *new_thr;
4159  int new_gtid;
4160 
4161  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4162  KMP_DEBUG_ASSERT(root && team);
4163 #if !KMP_NESTED_HOT_TEAMS
4164  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4165 #endif
4166  KMP_MB();
4167 
4168  /* first, try to get one from the thread pool */
4169  if (__kmp_thread_pool) {
4170 
4171  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4172  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4173  if (new_thr == __kmp_thread_pool_insert_pt) {
4174  __kmp_thread_pool_insert_pt = NULL;
4175  }
4176  TCW_4(new_thr->th.th_in_pool, FALSE);
4177  // Don't touch th_active_in_pool or th_active.
4178  // The worker thread adjusts those flags as it sleeps/awakens.
4179  __kmp_thread_pool_nth--;
4180 
4181  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4182  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4183  KMP_ASSERT(!new_thr->th.th_team);
4184  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4185  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4186 
4187  /* setup the thread structure */
4188  __kmp_initialize_info(new_thr, team, new_tid,
4189  new_thr->th.th_info.ds.ds_gtid);
4190  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4191 
4192  TCW_4(__kmp_nth, __kmp_nth + 1);
4193  root->r.r_cg_nthreads++;
4194 
4195  new_thr->th.th_task_state = 0;
4196  new_thr->th.th_task_state_top = 0;
4197  new_thr->th.th_task_state_stack_sz = 4;
4198 
4199 #ifdef KMP_ADJUST_BLOCKTIME
4200  /* Adjust blocktime back to zero if necessary */
4201  /* Middle initialization might not have occurred yet */
4202  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4203  if (__kmp_nth > __kmp_avail_proc) {
4204  __kmp_zero_bt = TRUE;
4205  }
4206  }
4207 #endif /* KMP_ADJUST_BLOCKTIME */
4208 
4209 #if KMP_DEBUG
4210  // If the thread entered the pool via __kmp_free_thread, wait_flag should
4211  // not equal KMP_BARRIER_PARENT_FLAG.
4212  int b;
4213  kmp_balign_t *balign = new_thr->th.th_bar;
4214  for (b = 0; b < bs_last_barrier; ++b)
4215  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4216 #endif
4217 
4218  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4219  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4220 
4221  KMP_MB();
4222  return new_thr;
4223  }
4224 
4225  /* no, we'll fork a new one */
4226  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4227  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4228 
4229 #if KMP_USE_MONITOR
4230  // If this is the first worker thread the RTL is creating, then also
4231  // launch the monitor thread. We try to do this as early as possible.
4232  if (!TCR_4(__kmp_init_monitor)) {
4233  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4234  if (!TCR_4(__kmp_init_monitor)) {
4235  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4236  TCW_4(__kmp_init_monitor, 1);
4237  __kmp_create_monitor(&__kmp_monitor);
4238  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4239 #if KMP_OS_WINDOWS
4240  // AC: wait until monitor has started. This is a fix for CQ232808.
4241  // The reason is that if the library is loaded/unloaded in a loop with
4242  // small (parallel) work in between, then there is a high probability that
4243  // the monitor thread starts only after the library has shut down. At that
4244  // point it is too late to cope with the problem, because when the master is
4245  // in DllMain (process detach) the monitor has no chance to start (it is
4246  // blocked), and the master has no means to inform the monitor that the
4247  // library has gone, because all the memory the monitor can access
4248  // is going to be released/reset.
4249  while (TCR_4(__kmp_init_monitor) < 2) {
4250  KMP_YIELD(TRUE);
4251  }
4252  KF_TRACE(10, ("after monitor thread has started\n"));
4253 #endif
4254  }
4255  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4256  }
4257 #endif
4258 
4259  KMP_MB();
4260  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4261  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4262  }
4263 
4264  /* allocate space for it. */
4265  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4266 
4267  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4268 
4269  if (__kmp_storage_map) {
4270  __kmp_print_thread_storage_map(new_thr, new_gtid);
4271  }
4272 
4273  // add the reserve serialized team, initialized from the team's master thread
4274  {
4275  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4276  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4277  new_thr->th.th_serial_team = serial_team =
4278  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4279 #if OMPT_SUPPORT
4280  ompt_data_none, // root parallel id
4281 #endif
4282 #if OMP_40_ENABLED
4283  proc_bind_default,
4284 #endif
4285  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4286  }
4287  KMP_ASSERT(serial_team);
4288  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4289  // execution (it is unused for now).
4290  serial_team->t.t_threads[0] = new_thr;
4291  KF_TRACE(10,
4292  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4293  new_thr));
4294 
4295  /* setup the thread structures */
4296  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4297 
4298 #if USE_FAST_MEMORY
4299  __kmp_initialize_fast_memory(new_thr);
4300 #endif /* USE_FAST_MEMORY */
4301 
4302 #if KMP_USE_BGET
4303  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4304  __kmp_initialize_bget(new_thr);
4305 #endif
4306 
4307  __kmp_init_random(new_thr); // Initialize random number generator
4308 
4309  /* Initialize these only once when thread is grabbed for a team allocation */
4310  KA_TRACE(20,
4311  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4312  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4313 
4314  int b;
4315  kmp_balign_t *balign = new_thr->th.th_bar;
4316  for (b = 0; b < bs_last_barrier; ++b) {
4317  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4318  balign[b].bb.team = NULL;
4319  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4320  balign[b].bb.use_oncore_barrier = 0;
4321  }
4322 
4323  new_thr->th.th_spin_here = FALSE;
4324  new_thr->th.th_next_waiting = 0;
4325 #if KMP_OS_UNIX
4326  new_thr->th.th_blocking = false;
4327 #endif
4328 
4329 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4330  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4331  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4332  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4333  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4334 #endif
4335 
4336  TCW_4(new_thr->th.th_in_pool, FALSE);
4337  new_thr->th.th_active_in_pool = FALSE;
4338  TCW_4(new_thr->th.th_active, TRUE);
4339 
4340  /* adjust the global counters */
4341  __kmp_all_nth++;
4342  __kmp_nth++;
4343 
4344  root->r.r_cg_nthreads++;
4345 
4346  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4347  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4348  if (__kmp_adjust_gtid_mode) {
4349  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4350  if (TCR_4(__kmp_gtid_mode) != 2) {
4351  TCW_4(__kmp_gtid_mode, 2);
4352  }
4353  } else {
4354  if (TCR_4(__kmp_gtid_mode) != 1) {
4355  TCW_4(__kmp_gtid_mode, 1);
4356  }
4357  }
4358  }
4359 
4360 #ifdef KMP_ADJUST_BLOCKTIME
4361  /* Adjust blocktime back to zero if necessary */
4362  /* Middle initialization might not have occurred yet */
4363  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4364  if (__kmp_nth > __kmp_avail_proc) {
4365  __kmp_zero_bt = TRUE;
4366  }
4367  }
4368 #endif /* KMP_ADJUST_BLOCKTIME */
4369 
4370  /* actually fork it and create the new worker thread */
4371  KF_TRACE(
4372  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4373  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4374  KF_TRACE(10,
4375  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4376 
4377  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4378  new_gtid));
4379  KMP_MB();
4380  return new_thr;
4381 }
4382 
4383 /* Reinitialize team for reuse.
4384  The hot team code calls this routine at every fork barrier, so the EPCC
4385  barrier tests are extremely sensitive to changes in it, esp. writes to the
4386  team struct, which cause a cache invalidation in all threads.
4387  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4388 static void __kmp_reinitialize_team(kmp_team_t *team,
4389  kmp_internal_control_t *new_icvs,
4390  ident_t *loc) {
4391  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4392  team->t.t_threads[0], team));
4393  KMP_DEBUG_ASSERT(team && new_icvs);
4394  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4395  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4396 
4397  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4398  // Copy ICVs to the master thread's implicit taskdata
4399  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4400  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4401 
4402  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4403  team->t.t_threads[0], team));
4404 }
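// Note: the KMP_CHECK_UPDATE uses above only write to the team struct when
// the value actually changes (roughly "if (a != b) a = b;"), which is what
// keeps hot-team reinitialization from dirtying the team cache lines in every
// worker on every fork barrier. A sketch of the idea, under that assumption:
//   if (team->t.t_ident != loc)   // store only on change
//     team->t.t_ident = loc;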
4405 
4406 /* Initialize the team data structure.
4407  This assumes the t_threads and t_max_nproc are already set.
4408  Also, we don't touch the arguments */
4409 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4410  kmp_internal_control_t *new_icvs,
4411  ident_t *loc) {
4412  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4413 
4414  /* verify */
4415  KMP_DEBUG_ASSERT(team);
4416  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4417  KMP_DEBUG_ASSERT(team->t.t_threads);
4418  KMP_MB();
4419 
4420  team->t.t_master_tid = 0; /* not needed */
4421  /* team->t.t_master_bar; not needed */
4422  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4423  team->t.t_nproc = new_nproc;
4424 
4425  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4426  team->t.t_next_pool = NULL;
4427  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4428  * up hot team */
4429 
4430  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4431  team->t.t_invoke = NULL; /* not needed */
4432 
4433  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4434  team->t.t_sched.sched = new_icvs->sched.sched;
4435 
4436 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4437  team->t.t_fp_control_saved = FALSE; /* not needed */
4438  team->t.t_x87_fpu_control_word = 0; /* not needed */
4439  team->t.t_mxcsr = 0; /* not needed */
4440 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4441 
4442  team->t.t_construct = 0;
4443 
4444  team->t.t_ordered.dt.t_value = 0;
4445  team->t.t_master_active = FALSE;
4446 
4447  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4448 
4449 #ifdef KMP_DEBUG
4450  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4451 #endif
4452 #if KMP_OS_WINDOWS
4453  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4454 #endif
4455 
4456  team->t.t_control_stack_top = NULL;
4457 
4458  __kmp_reinitialize_team(team, new_icvs, loc);
4459 
4460  KMP_MB();
4461  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4462 }
4463 
4464 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4465 /* Sets the full mask for the thread, saving the old mask in *old_mask; no changes to structures. */
4466 static void
4467 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4468  if (KMP_AFFINITY_CAPABLE()) {
4469  int status;
4470  if (old_mask != NULL) {
4471  status = __kmp_get_system_affinity(old_mask, TRUE);
4472  int error = errno;
4473  if (status != 0) {
4474  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4475  __kmp_msg_null);
4476  }
4477  }
4478  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4479  }
4480 }
4481 #endif
4482 
4483 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4484 
4485 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4486 // It calculates the worker + master thread's partition based upon the parent
4487 // thread's partition, and binds each worker to a place in its partition.
4488 // The master thread's partition should already include its current binding.
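// Note (illustrative example, not from the original comments): suppose the
// master's partition is places [2..5] (4 places), the master sits on place 3,
// and the team has 4 threads. proc_bind_master below keeps all workers on
// place 3, while proc_bind_close walks the partition cyclically from the
// master's place: T1 -> 4, T2 -> 5, T3 -> 2, with every thread keeping the
// full [2,5] partition.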
4489 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4490  // Copy the master thread's place partition to the team struct
4491  kmp_info_t *master_th = team->t.t_threads[0];
4492  KMP_DEBUG_ASSERT(master_th != NULL);
4493  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4494  int first_place = master_th->th.th_first_place;
4495  int last_place = master_th->th.th_last_place;
4496  int masters_place = master_th->th.th_current_place;
4497  team->t.t_first_place = first_place;
4498  team->t.t_last_place = last_place;
4499 
4500  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4501  "bound to place %d partition = [%d,%d]\n",
4502  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4503  team->t.t_id, masters_place, first_place, last_place));
4504 
4505  switch (proc_bind) {
4506 
4507  case proc_bind_default:
4508  // serial teams might have the proc_bind policy set to proc_bind_default. It
4509  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4510  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4511  break;
4512 
4513  case proc_bind_master: {
4514  int f;
4515  int n_th = team->t.t_nproc;
4516  for (f = 1; f < n_th; f++) {
4517  kmp_info_t *th = team->t.t_threads[f];
4518  KMP_DEBUG_ASSERT(th != NULL);
4519  th->th.th_first_place = first_place;
4520  th->th.th_last_place = last_place;
4521  th->th.th_new_place = masters_place;
4522 
4523  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4524  "partition = [%d,%d]\n",
4525  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4526  f, masters_place, first_place, last_place));
4527  }
4528  } break;
4529 
4530  case proc_bind_close: {
4531  int f;
4532  int n_th = team->t.t_nproc;
4533  int n_places;
4534  if (first_place <= last_place) {
4535  n_places = last_place - first_place + 1;
4536  } else {
4537  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4538  }
4539  if (n_th <= n_places) {
4540  int place = masters_place;
4541  for (f = 1; f < n_th; f++) {
4542  kmp_info_t *th = team->t.t_threads[f];
4543  KMP_DEBUG_ASSERT(th != NULL);
4544 
4545  if (place == last_place) {
4546  place = first_place;
4547  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4548  place = 0;
4549  } else {
4550  place++;
4551  }
4552  th->th.th_first_place = first_place;
4553  th->th.th_last_place = last_place;
4554  th->th.th_new_place = place;
4555 
4556  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4557  "partition = [%d,%d]\n",
4558  __kmp_gtid_from_thread(team->t.t_threads[f]),
4559  team->t.t_id, f, place, first_place, last_place));
4560  }
4561  } else {
4562  int S, rem, gap, s_count;
4563  S = n_th / n_places;
4564  s_count = 0;
4565  rem = n_th - (S * n_places);
4566  gap = rem > 0 ? n_places / rem : n_places;
4567  int place = masters_place;
4568  int gap_ct = gap;
4569  for (f = 0; f < n_th; f++) {
4570  kmp_info_t *th = team->t.t_threads[f];
4571  KMP_DEBUG_ASSERT(th != NULL);
4572 
4573  th->th.th_first_place = first_place;
4574  th->th.th_last_place = last_place;
4575  th->th.th_new_place = place;
4576  s_count++;
4577 
4578  if ((s_count == S) && rem && (gap_ct == gap)) {
4579  // do nothing, add an extra thread to place on next iteration
4580  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4581  // we added an extra thread to this place; move to next place
4582  if (place == last_place) {
4583  place = first_place;
4584  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4585  place = 0;
4586  } else {
4587  place++;
4588  }
4589  s_count = 0;
4590  gap_ct = 1;
4591  rem--;
4592  } else if (s_count == S) { // place full; don't add extra
4593  if (place == last_place) {
4594  place = first_place;
4595  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4596  place = 0;
4597  } else {
4598  place++;
4599  }
4600  gap_ct++;
4601  s_count = 0;
4602  }
4603 
4604  KA_TRACE(100,
4605  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4606  "partition = [%d,%d]\n",
4607  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4608  th->th.th_new_place, first_place, last_place));
4609  }
4610  KMP_DEBUG_ASSERT(place == masters_place);
4611  }
4612  } break;
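// Note: in the oversubscribed branch above (n_th > n_places), threads are
// dealt out S = n_th / n_places per place, and the rem = n_th - S * n_places
// leftovers get one extra thread every `gap` places. Hypothetical example:
// n_th = 10, n_places = 4 gives S = 2, rem = 2, gap = 2, i.e. occupancies
// 3, 2, 3, 2 starting from the master's place, and the walk ends back on the
// master's place (hence the assert).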
4613 
4614  case proc_bind_spread: {
4615  int f;
4616  int n_th = team->t.t_nproc;
4617  int n_places;
4618  int thidx;
4619  if (first_place <= last_place) {
4620  n_places = last_place - first_place + 1;
4621  } else {
4622  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4623  }
4624  if (n_th <= n_places) {
4625  int place = -1;
4626 
4627  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4628  int S = n_places / n_th;
4629  int s_count, rem, gap, gap_ct;
4630 
4631  place = masters_place;
4632  rem = n_places - n_th * S;
4633  gap = rem ? n_th / rem : 1;
4634  gap_ct = gap;
4635  thidx = n_th;
4636  if (update_master_only == 1)
4637  thidx = 1;
4638  for (f = 0; f < thidx; f++) {
4639  kmp_info_t *th = team->t.t_threads[f];
4640  KMP_DEBUG_ASSERT(th != NULL);
4641 
4642  th->th.th_first_place = place;
4643  th->th.th_new_place = place;
4644  s_count = 1;
4645  while (s_count < S) {
4646  if (place == last_place) {
4647  place = first_place;
4648  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4649  place = 0;
4650  } else {
4651  place++;
4652  }
4653  s_count++;
4654  }
4655  if (rem && (gap_ct == gap)) {
4656  if (place == last_place) {
4657  place = first_place;
4658  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4659  place = 0;
4660  } else {
4661  place++;
4662  }
4663  rem--;
4664  gap_ct = 0;
4665  }
4666  th->th.th_last_place = place;
4667  gap_ct++;
4668 
4669  if (place == last_place) {
4670  place = first_place;
4671  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4672  place = 0;
4673  } else {
4674  place++;
4675  }
4676 
4677  KA_TRACE(100,
4678  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4679  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4680  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4681  f, th->th.th_new_place, th->th.th_first_place,
4682  th->th.th_last_place, __kmp_affinity_num_masks));
4683  }
4684  } else {
4685  /* With a uniform space of available computation places we can create
4686  T partitions of roughly round(P/T) places each and put threads into
4687  the first place of each partition. */
4688  double current = static_cast<double>(masters_place);
4689  double spacing =
4690  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4691  int first, last;
4692  kmp_info_t *th;
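 // Illustrative sketch (added; assumed values): with n_places = 8, n_th = 3
 // and masters_place = 0, spacing = (8 + 1) / 3 = 3.0 and the loop below
 // assigns partitions [0,2], [3,5] and [6,7] (the last is clamped to
 // n_places - 1); the extra iteration with f == n_th only normalizes place so
 // the trailing assertion can compare it against masters_place.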
4693 
4694  thidx = n_th + 1;
4695  if (update_master_only == 1)
4696  thidx = 1;
4697  for (f = 0; f < thidx; f++) {
4698  first = static_cast<int>(current);
4699  last = static_cast<int>(current + spacing) - 1;
4700  KMP_DEBUG_ASSERT(last >= first);
4701  if (first >= n_places) {
4702  if (masters_place) {
4703  first -= n_places;
4704  last -= n_places;
4705  if (first == (masters_place + 1)) {
4706  KMP_DEBUG_ASSERT(f == n_th);
4707  first--;
4708  }
4709  if (last == masters_place) {
4710  KMP_DEBUG_ASSERT(f == (n_th - 1));
4711  last--;
4712  }
4713  } else {
4714  KMP_DEBUG_ASSERT(f == n_th);
4715  first = 0;
4716  last = 0;
4717  }
4718  }
4719  if (last >= n_places) {
4720  last = (n_places - 1);
4721  }
4722  place = first;
4723  current += spacing;
4724  if (f < n_th) {
4725  KMP_DEBUG_ASSERT(0 <= first);
4726  KMP_DEBUG_ASSERT(n_places > first);
4727  KMP_DEBUG_ASSERT(0 <= last);
4728  KMP_DEBUG_ASSERT(n_places > last);
4729  KMP_DEBUG_ASSERT(last_place >= first_place);
4730  th = team->t.t_threads[f];
4731  KMP_DEBUG_ASSERT(th);
4732  th->th.th_first_place = first;
4733  th->th.th_new_place = place;
4734  th->th.th_last_place = last;
4735 
4736  KA_TRACE(100,
4737  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4738  "partition = [%d,%d], spacing = %.4f\n",
4739  __kmp_gtid_from_thread(team->t.t_threads[f]),
4740  team->t.t_id, f, th->th.th_new_place,
4741  th->th.th_first_place, th->th.th_last_place, spacing));
4742  }
4743  }
4744  }
4745  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4746  } else {
4747  int S, rem, gap, s_count;
4748  S = n_th / n_places;
4749  s_count = 0;
4750  rem = n_th - (S * n_places);
4751  gap = rem > 0 ? n_places / rem : n_places;
4752  int place = masters_place;
4753  int gap_ct = gap;
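 // Illustrative note (added): this is the same S/rem/gap packing as the
 // overflow branch of proc_bind_close above -- e.g. n_th = 10 over
 // n_places = 4 again yields 3, 2, 3, 2 threads per place -- except that each
 // thread's partition is collapsed to its single place.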
4754  thidx = n_th;
4755  if (update_master_only == 1)
4756  thidx = 1;
4757  for (f = 0; f < thidx; f++) {
4758  kmp_info_t *th = team->t.t_threads[f];
4759  KMP_DEBUG_ASSERT(th != NULL);
4760 
4761  th->th.th_first_place = place;
4762  th->th.th_last_place = place;
4763  th->th.th_new_place = place;
4764  s_count++;
4765 
4766  if ((s_count == S) && rem && (gap_ct == gap)) {
4767  // do nothing; an extra thread will be added to this place on the next iteration
4768  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4769  // we added an extra thread to this place; move on to next place
4770  if (place == last_place) {
4771  place = first_place;
4772  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4773  place = 0;
4774  } else {
4775  place++;
4776  }
4777  s_count = 0;
4778  gap_ct = 1;
4779  rem--;
4780  } else if (s_count == S) { // place is full; don't add extra thread
4781  if (place == last_place) {
4782  place = first_place;
4783  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4784  place = 0;
4785  } else {
4786  place++;
4787  }
4788  gap_ct++;
4789  s_count = 0;
4790  }
4791 
4792  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4793  "partition = [%d,%d]\n",
4794  __kmp_gtid_from_thread(team->t.t_threads[f]),
4795  team->t.t_id, f, th->th.th_new_place,
4796  th->th.th_first_place, th->th.th_last_place));
4797  }
4798  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4799  }
4800  } break;
4801 
4802  default:
4803  break;
4804  }
4805 
4806  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4807 }
4808 
4809 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4810 
4811 /* allocate a new team data structure to use. take one off of the free pool if
4812  available */
4813 kmp_team_t *
4814 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4815 #if OMPT_SUPPORT
4816  ompt_data_t ompt_parallel_data,
4817 #endif
4818 #if OMP_40_ENABLED
4819  kmp_proc_bind_t new_proc_bind,
4820 #endif
4821  kmp_internal_control_t *new_icvs,
4822  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4823  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4824  int f;
4825  kmp_team_t *team;
4826  int use_hot_team = !root->r.r_active;
4827  int level = 0;
4828 
4829  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4830  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4831  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4832  KMP_MB();
4833 
4834 #if KMP_NESTED_HOT_TEAMS
4835  kmp_hot_team_ptr_t *hot_teams;
4836  if (master) {
4837  team = master->th.th_team;
4838  level = team->t.t_active_level;
4839  if (master->th.th_teams_microtask) { // in teams construct?
4840  if (master->th.th_teams_size.nteams > 1 &&
4841  ( // #teams > 1
4842  team->t.t_pkfn ==
4843  (microtask_t)__kmp_teams_master || // inner fork of the teams
4844  master->th.th_teams_level <
4845  team->t.t_level)) { // or nested parallel inside the teams
4846  ++level; // do not increment if #teams==1 or for the outer fork of the
4847  // teams; increment otherwise
4848  }
4849  }
4850  hot_teams = master->th.th_hot_teams;
4851  if (level < __kmp_hot_teams_max_level && hot_teams &&
4852  hot_teams[level]
4853  .hot_team) { // hot team has already been allocated for given level
4854  use_hot_team = 1;
4855  } else {
4856  use_hot_team = 0;
4857  }
4858  }
4859 #endif
4860  // Optimization to use a "hot" team
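 // (Added note) The "hot" team is the team kept alive between parallel regions
 // of a root so its threads and data structures can be reused instead of being
 // reallocated on every fork; the code below only patches up what may have
 // changed (team size, ICVs, schedule, proc_bind/places).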
4861  if (use_hot_team && new_nproc > 1) {
4862  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4863 #if KMP_NESTED_HOT_TEAMS
4864  team = hot_teams[level].hot_team;
4865 #else
4866  team = root->r.r_hot_team;
4867 #endif
4868 #if KMP_DEBUG
4869  if (__kmp_tasking_mode != tskm_immediate_exec) {
4870  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4871  "task_team[1] = %p before reinit\n",
4872  team->t.t_task_team[0], team->t.t_task_team[1]));
4873  }
4874 #endif
4875 
4876  // Has the number of threads changed?
4877  /* Let's assume the most common case is that the number of threads is
4878  unchanged, and put that case first. */
4879  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4880  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4881  // This case can mean that omp_set_num_threads() was called and the hot
4882  // team size was already reduced, so we check the special flag
4883  if (team->t.t_size_changed == -1) {
4884  team->t.t_size_changed = 1;
4885  } else {
4886  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4887  }
4888 
4889  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4890  kmp_r_sched_t new_sched = new_icvs->sched;
4891  // set master's schedule as new run-time schedule
4892  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4893 
4894  __kmp_reinitialize_team(team, new_icvs,
4895  root->r.r_uber_thread->th.th_ident);
4896 
4897  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4898  team->t.t_threads[0], team));
4899  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4900 
4901 #if OMP_40_ENABLED
4902 #if KMP_AFFINITY_SUPPORTED
4903  if ((team->t.t_size_changed == 0) &&
4904  (team->t.t_proc_bind == new_proc_bind)) {
4905  if (new_proc_bind == proc_bind_spread) {
4906  __kmp_partition_places(
4907  team, 1); // add flag to update only master for spread
4908  }
4909  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4910  "proc_bind = %d, partition = [%d,%d]\n",
4911  team->t.t_id, new_proc_bind, team->t.t_first_place,
4912  team->t.t_last_place));
4913  } else {
4914  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4915  __kmp_partition_places(team);
4916  }
4917 #else
4918  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4919 #endif /* KMP_AFFINITY_SUPPORTED */
4920 #endif /* OMP_40_ENABLED */
4921  } else if (team->t.t_nproc > new_nproc) {
4922  KA_TRACE(20,
4923  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4924  new_nproc));
4925 
4926  team->t.t_size_changed = 1;
4927 #if KMP_NESTED_HOT_TEAMS
4928  if (__kmp_hot_teams_mode == 0) {
4929  // AC: the saved number of threads should match the team's value in this
4930  // mode; it can be bigger in mode 1, when hot team has threads in reserve
4931  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4932  hot_teams[level].hot_team_nth = new_nproc;
4933 #endif // KMP_NESTED_HOT_TEAMS
4934  /* release the extra threads we don't need any more */
4935  for (f = new_nproc; f < team->t.t_nproc; f++) {
4936  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4937  if (__kmp_tasking_mode != tskm_immediate_exec) {
4938  // When decreasing team size, threads no longer in the team should
4939  // unref task team.
4940  team->t.t_threads[f]->th.th_task_team = NULL;
4941  }
4942  __kmp_free_thread(team->t.t_threads[f]);
4943  team->t.t_threads[f] = NULL;
4944  }
4945 #if KMP_NESTED_HOT_TEAMS
4946  } // (__kmp_hot_teams_mode == 0)
4947  else {
4948  // When keeping extra threads in team, switch threads to wait on own
4949  // b_go flag
4950  for (f = new_nproc; f < team->t.t_nproc; ++f) {
4951  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4952  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4953  for (int b = 0; b < bs_last_barrier; ++b) {
4954  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4955  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4956  }
4957  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4958  }
4959  }
4960  }
4961 #endif // KMP_NESTED_HOT_TEAMS
4962  team->t.t_nproc = new_nproc;
4963  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4964  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4965  __kmp_reinitialize_team(team, new_icvs,
4966  root->r.r_uber_thread->th.th_ident);
4967 
4968  /* update the remaining threads */
4969  for (f = 0; f < new_nproc; ++f) {
4970  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4971  }
4972  // restore the current task state of the master thread: should be the
4973  // implicit task
4974  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4975  team->t.t_threads[0], team));
4976 
4977  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4978 
4979 #ifdef KMP_DEBUG
4980  for (f = 0; f < team->t.t_nproc; f++) {
4981  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4982  team->t.t_threads[f]->th.th_team_nproc ==
4983  team->t.t_nproc);
4984  }
4985 #endif
4986 
4987 #if OMP_40_ENABLED
4988  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4989 #if KMP_AFFINITY_SUPPORTED
4990  __kmp_partition_places(team);
4991 #endif
4992 #endif
4993  } else { // team->t.t_nproc < new_nproc
4994 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4995  kmp_affin_mask_t *old_mask;
4996  if (KMP_AFFINITY_CAPABLE()) {
4997  KMP_CPU_ALLOC(old_mask);
4998  }
4999 #endif
5000 
5001  KA_TRACE(20,
5002  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5003  new_nproc));
5004 
5005  team->t.t_size_changed = 1;
5006 
5007 #if KMP_NESTED_HOT_TEAMS
5008  int avail_threads = hot_teams[level].hot_team_nth;
5009  if (new_nproc < avail_threads)
5010  avail_threads = new_nproc;
5011  kmp_info_t **other_threads = team->t.t_threads;
5012  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5013  // Adjust barrier data of reserved threads (if any) of the team
5014  // Other data will be set in __kmp_initialize_info() below.
5015  int b;
5016  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5017  for (b = 0; b < bs_last_barrier; ++b) {
5018  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5019  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5020 #if USE_DEBUGGER
5021  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5022 #endif
5023  }
5024  }
5025  if (hot_teams[level].hot_team_nth >= new_nproc) {
5026  // we have all needed threads in reserve, no need to allocate any
5027  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5028  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5029  team->t.t_nproc = new_nproc; // just get reserved threads involved
5030  } else {
5031  // we may have some threads in reserve, but not enough
5032  team->t.t_nproc =
5033  hot_teams[level]
5034  .hot_team_nth; // get reserved threads involved if any
5035  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5036 #endif // KMP_NESTED_HOT_TEAMS
5037  if (team->t.t_max_nproc < new_nproc) {
5038  /* reallocate larger arrays */
5039  __kmp_reallocate_team_arrays(team, new_nproc);
5040  __kmp_reinitialize_team(team, new_icvs, NULL);
5041  }
5042 
5043 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5044  /* Temporarily set full mask for master thread before creation of
5045  workers. The reason is that workers inherit the affinity from master,
5046  so if a lot of workers are created on a single core quickly, they
5047  don't get a chance to set their own affinity for a long time. */
5048  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5049 #endif
5050 
5051  /* allocate new threads for the hot team */
5052  for (f = team->t.t_nproc; f < new_nproc; f++) {
5053  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5054  KMP_DEBUG_ASSERT(new_worker);
5055  team->t.t_threads[f] = new_worker;
5056 
5057  KA_TRACE(20,
5058  ("__kmp_allocate_team: team %d init T#%d arrived: "
5059  "join=%llu, plain=%llu\n",
5060  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5061  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5062  team->t.t_bar[bs_plain_barrier].b_arrived));
5063 
5064  { // Initialize barrier data for new threads.
5065  int b;
5066  kmp_balign_t *balign = new_worker->th.th_bar;
5067  for (b = 0; b < bs_last_barrier; ++b) {
5068  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5069  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5070  KMP_BARRIER_PARENT_FLAG);
5071 #if USE_DEBUGGER
5072  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5073 #endif
5074  }
5075  }
5076  }
5077 
5078 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5079  if (KMP_AFFINITY_CAPABLE()) {
5080  /* Restore initial master thread's affinity mask */
5081  __kmp_set_system_affinity(old_mask, TRUE);
5082  KMP_CPU_FREE(old_mask);
5083  }
5084 #endif
5085 #if KMP_NESTED_HOT_TEAMS
5086  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5087 #endif // KMP_NESTED_HOT_TEAMS
5088  /* make sure everyone is synchronized */
5089  int old_nproc = team->t.t_nproc; // save old value and use to update only
5090  // new threads below
5091  __kmp_initialize_team(team, new_nproc, new_icvs,
5092  root->r.r_uber_thread->th.th_ident);
5093 
5094  /* reinitialize the threads */
5095  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5096  for (f = 0; f < team->t.t_nproc; ++f)
5097  __kmp_initialize_info(team->t.t_threads[f], team, f,
5098  __kmp_gtid_from_tid(f, team));
5099  if (level) { // set th_task_state for new threads in nested hot team
5100  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5101  // only need to set the th_task_state for the new threads. th_task_state
5102  // for master thread will not be accurate until after this in
5103  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5104  // correct value.
5105  for (f = old_nproc; f < team->t.t_nproc; ++f)
5106  team->t.t_threads[f]->th.th_task_state =
5107  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5108  } else { // set th_task_state for new threads in non-nested hot team
5109  int old_state =
5110  team->t.t_threads[0]->th.th_task_state; // copy master's state
5111  for (f = old_nproc; f < team->t.t_nproc; ++f)
5112  team->t.t_threads[f]->th.th_task_state = old_state;
5113  }
5114 
5115 #ifdef KMP_DEBUG
5116  for (f = 0; f < team->t.t_nproc; ++f) {
5117  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5118  team->t.t_threads[f]->th.th_team_nproc ==
5119  team->t.t_nproc);
5120  }
5121 #endif
5122 
5123 #if OMP_40_ENABLED
5124  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5125 #if KMP_AFFINITY_SUPPORTED
5126  __kmp_partition_places(team);
5127 #endif
5128 #endif
5129  } // Check changes in number of threads
5130 
5131 #if OMP_40_ENABLED
5132  kmp_info_t *master = team->t.t_threads[0];
5133  if (master->th.th_teams_microtask) {
5134  for (f = 1; f < new_nproc; ++f) {
5135  // propagate teams construct specific info to workers
5136  kmp_info_t *thr = team->t.t_threads[f];
5137  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5138  thr->th.th_teams_level = master->th.th_teams_level;
5139  thr->th.th_teams_size = master->th.th_teams_size;
5140  }
5141  }
5142 #endif /* OMP_40_ENABLED */
5143 #if KMP_NESTED_HOT_TEAMS
5144  if (level) {
5145  // Sync barrier state for nested hot teams, not needed for outermost hot
5146  // team.
5147  for (f = 1; f < new_nproc; ++f) {
5148  kmp_info_t *thr = team->t.t_threads[f];
5149  int b;
5150  kmp_balign_t *balign = thr->th.th_bar;
5151  for (b = 0; b < bs_last_barrier; ++b) {
5152  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5153  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5154 #if USE_DEBUGGER
5155  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5156 #endif
5157  }
5158  }
5159  }
5160 #endif // KMP_NESTED_HOT_TEAMS
5161 
5162  /* reallocate space for arguments if necessary */
5163  __kmp_alloc_argv_entries(argc, team, TRUE);
5164  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5165  // The hot team re-uses the previous task team,
5166  // if untouched during the previous release->gather phase.
5167 
5168  KF_TRACE(10, (" hot_team = %p\n", team));
5169 
5170 #if KMP_DEBUG
5171  if (__kmp_tasking_mode != tskm_immediate_exec) {
5172  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5173  "task_team[1] = %p after reinit\n",
5174  team->t.t_task_team[0], team->t.t_task_team[1]));
5175  }
5176 #endif
5177 
5178 #if OMPT_SUPPORT
5179  __ompt_team_assign_id(team, ompt_parallel_data);
5180 #endif
5181 
5182  KMP_MB();
5183 
5184  return team;
5185  }
5186 
5187  /* next, let's try to take one from the team pool */
5188  KMP_MB();
5189  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5190  /* TODO: consider resizing undersized teams instead of reaping them, now
5191  that we have a resizing mechanism */
5192  if (team->t.t_max_nproc >= max_nproc) {
5193  /* take this team from the team pool */
5194  __kmp_team_pool = team->t.t_next_pool;
5195 
5196  /* setup the team for fresh use */
5197  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5198 
5199  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5200  "task_team[1] %p to NULL\n",
5201  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5202  team->t.t_task_team[0] = NULL;
5203  team->t.t_task_team[1] = NULL;
5204 
5205  /* reallocate space for arguments if necessary */
5206  __kmp_alloc_argv_entries(argc, team, TRUE);
5207  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5208 
5209  KA_TRACE(
5210  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5211  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5212  { // Initialize barrier data.
5213  int b;
5214  for (b = 0; b < bs_last_barrier; ++b) {
5215  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5216 #if USE_DEBUGGER
5217  team->t.t_bar[b].b_master_arrived = 0;
5218  team->t.t_bar[b].b_team_arrived = 0;
5219 #endif
5220  }
5221  }
5222 
5223 #if OMP_40_ENABLED
5224  team->t.t_proc_bind = new_proc_bind;
5225 #endif
5226 
5227  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5228  team->t.t_id));
5229 
5230 #if OMPT_SUPPORT
5231  __ompt_team_assign_id(team, ompt_parallel_data);
5232 #endif
5233 
5234  KMP_MB();
5235 
5236  return team;
5237  }
5238 
5239  /* reap team if it is too small, then loop back and check the next one */
5240  // not sure if this is wise, but it will be redone during the hot-teams
5241  // rewrite.
5242  /* TODO: Use technique to find the right size hot-team, don't reap them */
5243  team = __kmp_reap_team(team);
5244  __kmp_team_pool = team;
5245  }
5246 
5247  /* nothing available in the pool, no matter, make a new team! */
5248  KMP_MB();
5249  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5250 
5251  /* and set it up */
5252  team->t.t_max_nproc = max_nproc;
5253  /* NOTE: for some reason allocating one big buffer and dividing it up
5254  seems to hurt performance significantly on the P4, so let's not do that */
5255  __kmp_allocate_team_arrays(team, max_nproc);
5256 
5257  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5258  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5259 
5260  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5261  "%p to NULL\n",
5262  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5263  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5264  // memory, no need to duplicate
5265  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5266  // memory, no need to duplicate
5267 
5268  if (__kmp_storage_map) {
5269  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5270  }
5271 
5272  /* allocate space for arguments */
5273  __kmp_alloc_argv_entries(argc, team, FALSE);
5274  team->t.t_argc = argc;
5275 
5276  KA_TRACE(20,
5277  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5278  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5279  { // Initialize barrier data.
5280  int b;
5281  for (b = 0; b < bs_last_barrier; ++b) {
5282  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5283 #if USE_DEBUGGER
5284  team->t.t_bar[b].b_master_arrived = 0;
5285  team->t.t_bar[b].b_team_arrived = 0;
5286 #endif
5287  }
5288  }
5289 
5290 #if OMP_40_ENABLED
5291  team->t.t_proc_bind = new_proc_bind;
5292 #endif
5293 
5294 #if OMPT_SUPPORT
5295  __ompt_team_assign_id(team, ompt_parallel_data);
5296  team->t.ompt_serialized_team_info = NULL;
5297 #endif
5298 
5299  KMP_MB();
5300 
5301  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5302  team->t.t_id));
5303 
5304  return team;
5305 }
5306 
5307 /* TODO implement hot-teams at all levels */
5308 /* TODO implement lazy thread release on demand (disband request) */
5309 
5310 /* free the team. return it to the team pool. release all the threads
5311  * associated with it */
5312 void __kmp_free_team(kmp_root_t *root,
5313  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5314  int f;
5315  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5316  team->t.t_id));
5317 
5318  /* verify state */
5319  KMP_DEBUG_ASSERT(root);
5320  KMP_DEBUG_ASSERT(team);
5321  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5322  KMP_DEBUG_ASSERT(team->t.t_threads);
5323 
5324  int use_hot_team = team == root->r.r_hot_team;
5325 #if KMP_NESTED_HOT_TEAMS
5326  int level;
5327  kmp_hot_team_ptr_t *hot_teams;
5328  if (master) {
5329  level = team->t.t_active_level - 1;
5330  if (master->th.th_teams_microtask) { // in teams construct?
5331  if (master->th.th_teams_size.nteams > 1) {
5332  ++level; // level was not increased in teams construct for
5333  // team_of_masters
5334  }
5335  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5336  master->th.th_teams_level == team->t.t_level) {
5337  ++level; // level was not increased in teams construct for
5338  // team_of_workers before the parallel
5339  } // team->t.t_level will be increased inside parallel
5340  }
5341  hot_teams = master->th.th_hot_teams;
5342  if (level < __kmp_hot_teams_max_level) {
5343  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5344  use_hot_team = 1;
5345  }
5346  }
5347 #endif // KMP_NESTED_HOT_TEAMS
5348 
5349  /* team is done working */
5350  TCW_SYNC_PTR(team->t.t_pkfn,
5351  NULL); // Important for Debugging Support Library.
5352 #if KMP_OS_WINDOWS
5353  team->t.t_copyin_counter = 0; // init counter for possible reuse
5354 #endif
5355  // Do not reset pointer to parent team to NULL for hot teams.
5356 
5357  /* if we are non-hot team, release our threads */
5358  if (!use_hot_team) {
5359  if (__kmp_tasking_mode != tskm_immediate_exec) {
5360  // Wait for threads to reach reapable state
5361  for (f = 1; f < team->t.t_nproc; ++f) {
5362  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5363  kmp_info_t *th = team->t.t_threads[f];
5364  volatile kmp_uint32 *state = &th->th.th_reap_state;
5365  while (*state != KMP_SAFE_TO_REAP) {
5366 #if KMP_OS_WINDOWS
5367  // On Windows a thread can be killed at any time, check this
5368  DWORD ecode;
5369  if (!__kmp_is_thread_alive(th, &ecode)) {
5370  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5371  break;
5372  }
5373 #endif
5374  // first check if thread is sleeping
5375  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5376  if (fl.is_sleeping())
5377  fl.resume(__kmp_gtid_from_thread(th));
5378  KMP_CPU_PAUSE();
5379  }
5380  }
5381 
5382  // Delete task teams
5383  int tt_idx;
5384  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5385  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5386  if (task_team != NULL) {
5387  for (f = 0; f < team->t.t_nproc;
5388  ++f) { // Have all threads unref task teams
5389  team->t.t_threads[f]->th.th_task_team = NULL;
5390  }
5391  KA_TRACE(
5392  20,
5393  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5394  __kmp_get_gtid(), task_team, team->t.t_id));
5395 #if KMP_NESTED_HOT_TEAMS
5396  __kmp_free_task_team(master, task_team);
5397 #endif
5398  team->t.t_task_team[tt_idx] = NULL;
5399  }
5400  }
5401  }
5402 
5403  // Reset pointer to parent team only for non-hot teams.
5404  team->t.t_parent = NULL;
5405  team->t.t_level = 0;
5406  team->t.t_active_level = 0;
5407 
5408  /* free the worker threads */
5409  for (f = 1; f < team->t.t_nproc; ++f) {
5410  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5411  __kmp_free_thread(team->t.t_threads[f]);
5412  team->t.t_threads[f] = NULL;
5413  }
5414 
5415  /* put the team back in the team pool */
5416  /* TODO limit size of team pool, call reap_team if pool too large */
5417  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5418  __kmp_team_pool = (volatile kmp_team_t *)team;
5419  }
5420 
5421  KMP_MB();
5422 }
5423 
5424 /* reap the team. destroy it, reclaim all its resources and free its memory */
5425 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5426  kmp_team_t *next_pool = team->t.t_next_pool;
5427 
5428  KMP_DEBUG_ASSERT(team);
5429  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5430  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5431  KMP_DEBUG_ASSERT(team->t.t_threads);
5432  KMP_DEBUG_ASSERT(team->t.t_argv);
5433 
5434  /* TODO clean the threads that are a part of this? */
5435 
5436  /* free stuff */
5437  __kmp_free_team_arrays(team);
5438  if (team->t.t_argv != &team->t.t_inline_argv[0])
5439  __kmp_free((void *)team->t.t_argv);
5440  __kmp_free(team);
5441 
5442  KMP_MB();
5443  return next_pool;
5444 }
5445 
5446 // Free the thread. Don't reap it, just place it on the pool of available
5447 // threads.
5448 //
5449 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5450 // binding for the affinity mechanism to be useful.
5451 //
5452 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5453 // However, we want to avoid a potential performance problem by always
5454 // scanning through the list to find the correct point at which to insert
5455 // the thread (potential N**2 behavior). To do this we keep track of the
5456 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5457 // With single-level parallelism, threads will always be added to the tail
5458 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5459 // parallelism, all bets are off and we may need to scan through the entire
5460 // free list.
5461 //
5462 // This change also has a potentially large performance benefit, for some
5463 // applications. Previously, as threads were freed from the hot team, they
5464 // would be placed back on the free list in inverse order. If the hot team
5465 // grew back to its original size, then the freed threads would be placed
5466 // back on the hot team in reverse order. This could cause bad cache
5467 // locality problems on programs where the size of the hot team regularly
5468 // grew and shrank.
5469 //
5470 // Now, for single-level parallelism, the OMP tid is always == gtid.
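//
// A minimal, generic sketch (added; the names here are illustrative only, not
// part of the runtime) of the same "sorted insert with a cached hint" idea:
//
//   node_t **scan = (hint && hint->id <= id) ? &hint->next : &head;
//   while (*scan && (*scan)->id < id)
//     scan = &(*scan)->next;          // find first node with a larger id
//   item->next = *scan;               // splice the new node in front of it
//   hint = *scan = item;              // remember the spot for the next insert
//
// __kmp_free_thread below does exactly this, with gtid as the key,
// __kmp_thread_pool as the head and __kmp_thread_pool_insert_pt as the hint.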
5471 void __kmp_free_thread(kmp_info_t *this_th) {
5472  int gtid;
5473  kmp_info_t **scan;
5474  kmp_root_t *root = this_th->th.th_root;
5475 
5476  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5477  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5478 
5479  KMP_DEBUG_ASSERT(this_th);
5480 
5481  // When moving a thread to the pool, switch it to wait on its own b_go flag
5482  // and reset its team pointer to NULL (uninitialized).
5483  int b;
5484  kmp_balign_t *balign = this_th->th.th_bar;
5485  for (b = 0; b < bs_last_barrier; ++b) {
5486  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5487  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5488  balign[b].bb.team = NULL;
5489  balign[b].bb.leaf_kids = 0;
5490  }
5491  this_th->th.th_task_state = 0;
5492  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5493 
5494  /* put thread back on the free pool */
5495  TCW_PTR(this_th->th.th_team, NULL);
5496  TCW_PTR(this_th->th.th_root, NULL);
5497  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5498 
5499  /* If the implicit task assigned to this thread can be used by other threads,
5500  * multiple threads may share the data and try to free the task in
5501  * __kmp_reap_thread at exit. This duplicate use of the task data happens
5502  * with higher probability when the hot team is disabled, but it can occur
5503  * even when the hot team is enabled */
5504  __kmp_free_implicit_task(this_th);
5505  this_th->th.th_current_task = NULL;
5506 
5507  // If the __kmp_thread_pool_insert_pt is already past the new insert
5508  // point, then we need to re-scan the entire list.
5509  gtid = this_th->th.th_info.ds.ds_gtid;
5510  if (__kmp_thread_pool_insert_pt != NULL) {
5511  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5512  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5513  __kmp_thread_pool_insert_pt = NULL;
5514  }
5515  }
5516 
5517  // Scan down the list to find the place to insert the thread.
5518  // scan is the address of a link in the list, possibly the address of
5519  // __kmp_thread_pool itself.
5520  //
5521  // In the absence of nested parallelism, the for loop will have 0 iterations.
5522  if (__kmp_thread_pool_insert_pt != NULL) {
5523  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5524  } else {
5525  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5526  }
5527  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5528  scan = &((*scan)->th.th_next_pool))
5529  ;
5530 
5531  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5532  // to its address.
5533  TCW_PTR(this_th->th.th_next_pool, *scan);
5534  __kmp_thread_pool_insert_pt = *scan = this_th;
5535  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5536  (this_th->th.th_info.ds.ds_gtid <
5537  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5538  TCW_4(this_th->th.th_in_pool, TRUE);
5539  __kmp_thread_pool_nth++;
5540 
5541  TCW_4(__kmp_nth, __kmp_nth - 1);
5542  root->r.r_cg_nthreads--;
5543 
5544 #ifdef KMP_ADJUST_BLOCKTIME
5545  /* Adjust blocktime back to user setting or default if necessary */
5546  /* Middle initialization might never have occurred */
5547  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5548  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5549  if (__kmp_nth <= __kmp_avail_proc) {
5550  __kmp_zero_bt = FALSE;
5551  }
5552  }
5553 #endif /* KMP_ADJUST_BLOCKTIME */
5554 
5555  KMP_MB();
5556 }
5557 
5558 /* ------------------------------------------------------------------------ */
5559 
5560 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5561  int gtid = this_thr->th.th_info.ds.ds_gtid;
5562  /* void *stack_data;*/
5563  kmp_team_t *(*volatile pteam);
5564 
5565  KMP_MB();
5566  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5567 
5568  if (__kmp_env_consistency_check) {
5569  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5570  }
5571 
5572 #if OMPT_SUPPORT
5573  ompt_data_t *thread_data;
5574  if (ompt_enabled.enabled) {
5575  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5576  thread_data->ptr = NULL;
5577 
5578  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5579  this_thr->th.ompt_thread_info.wait_id = 0;
5580  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5581  if (ompt_enabled.ompt_callback_thread_begin) {
5582  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5583  ompt_thread_worker, thread_data);
5584  }
5585  }
5586 #endif
5587 
5588 #if OMPT_SUPPORT
5589  if (ompt_enabled.enabled) {
5590  this_thr->th.ompt_thread_info.state = omp_state_idle;
5591  }
5592 #endif
5593  /* This is the place where threads wait for work */
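 /* (Added summary) Worker lifecycle: each loop iteration parks the thread in
 __kmp_fork_barrier() until a master releases it with a team, runs the
 team's microtask via t_invoke(), then meets the master again in
 __kmp_join_barrier(); the loop exits once __kmp_global.g.g_done is set at
 shutdown. */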
5594  while (!TCR_4(__kmp_global.g.g_done)) {
5595  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5596  KMP_MB();
5597 
5598  /* wait for work to do */
5599  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5600 
5601  /* No tid yet since not part of a team */
5602  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5603 
5604 #if OMPT_SUPPORT
5605  if (ompt_enabled.enabled) {
5606  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5607  }
5608 #endif
5609 
5610  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
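 // (Added note) pteam is simply the address of this thread's th_team field;
 // it is re-read with TCR_SYNC_PTR below because the master publishes the
 // team pointer from another thread when handing out work.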
5611 
5612  /* have we been allocated? */
5613  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5614  /* we were just woken up, so run our new task */
5615  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5616  int rc;
5617  KA_TRACE(20,
5618  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5619  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5620  (*pteam)->t.t_pkfn));
5621 
5622  updateHWFPControl(*pteam);
5623 
5624 #if OMPT_SUPPORT
5625  if (ompt_enabled.enabled) {
5626  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5627  }
5628 #endif
5629 
5630  rc = (*pteam)->t.t_invoke(gtid);
5631  KMP_ASSERT(rc);
5632 
5633  KMP_MB();
5634  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5635  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5636  (*pteam)->t.t_pkfn));
5637  }
5638 #if OMPT_SUPPORT
5639  if (ompt_enabled.enabled) {
5640  /* no frame set while outside task */
5641  __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5642 
5643  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5644  }
5645 #endif
5646  /* join barrier after parallel region */
5647  __kmp_join_barrier(gtid);
5648  }
5649  }
5650  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5651 
5652 #if OMPT_SUPPORT
5653  if (ompt_enabled.ompt_callback_thread_end) {
5654  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5655  }
5656 #endif
5657 
5658  this_thr->th.th_task_team = NULL;
5659  /* run the destructors for the threadprivate data for this thread */
5660  __kmp_common_destroy_gtid(gtid);
5661 
5662  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5663  KMP_MB();
5664  return this_thr;
5665 }
5666 
5667 /* ------------------------------------------------------------------------ */
5668 
5669 void __kmp_internal_end_dest(void *specific_gtid) {
5670 #if KMP_COMPILER_ICC
5671 #pragma warning(push)
5672 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5673 // significant bits
5674 #endif
5675  // Make sure no significant bits are lost
5676  int gtid = (kmp_intptr_t)specific_gtid - 1;
5677 #if KMP_COMPILER_ICC
5678 #pragma warning(pop)
5679 #endif
5680 
5681  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5682  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5683  * this is because 0 is reserved for the nothing-stored case */
5684 
5685  /* josh: One reason for setting the gtid specific data even when it is being
5686  destroyed by pthread is to allow gtid lookup through thread specific data
5687  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5688  that gets executed in the call to __kmp_internal_end_thread, actually
5689  gets the gtid through the thread specific data. Setting it here seems
5690  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5691  to run smoothly.
5692  todo: get rid of this after we remove the dependence on
5693  __kmp_gtid_get_specific */
5694  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5695  __kmp_gtid_set_specific(gtid);
5696 #ifdef KMP_TDATA_GTID
5697  __kmp_gtid = gtid;
5698 #endif
5699  __kmp_internal_end_thread(gtid);
5700 }
5701 
5702 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5703 
5704 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5705 // cases destructors work perfectly, but in the real libomp.so I have no evidence
5706 // it is ever called. However, the -fini linker option in makefile.mk works fine.
5707 
5708 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5709  __kmp_internal_end_atexit();
5710 }
5711 
5712 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5713 
5714 #endif
5715 
5716 /* [Windows] josh: when the atexit handler is called, there may still be more
5717  than one thread alive */
5718 void __kmp_internal_end_atexit(void) {
5719  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5720  /* [Windows]
5721  josh: ideally, we want to completely shutdown the library in this atexit
5722  handler, but stat code that depends on thread specific data for gtid fails
5723  because that data becomes unavailable at some point during the shutdown, so
5724  we call __kmp_internal_end_thread instead. We should eventually remove the
5725  dependency on __kmp_get_specific_gtid in the stat code and use
5726  __kmp_internal_end_library to cleanly shutdown the library.
5727 
5728  // TODO: Can some of this comment about GVS be removed?
5729  I suspect that the offending stat code is executed when the calling thread
5730  tries to clean up a dead root thread's data structures, resulting in GVS
5731  code trying to close the GVS structures for that thread, but since the stat
5732  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5733  the calling thread is cleaning up itself instead of another thread, it gets
5734  confused. This happens because allowing a thread to unregister and clean up
5735  another thread is a recent modification for addressing an issue.
5736  Based on the current design (20050722), a thread may end up
5737  trying to unregister another thread only if thread death does not trigger
5738  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5739  thread specific data destructor function to detect thread death. For
5740  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5741  is nothing. Thus, the workaround is applicable only for Windows static
5742  stat library. */
5743  __kmp_internal_end_library(-1);
5744 #if KMP_OS_WINDOWS
5745  __kmp_close_console();
5746 #endif
5747 }
5748 
5749 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5750  // It is assumed __kmp_forkjoin_lock is acquired.
5751 
5752  int gtid;
5753 
5754  KMP_DEBUG_ASSERT(thread != NULL);
5755 
5756  gtid = thread->th.th_info.ds.ds_gtid;
5757 
5758  if (!is_root) {
5759 
5760  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5761  /* Assume the threads are at the fork barrier here */
5762  KA_TRACE(
5763  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5764  gtid));
5765  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5766  * (GEH) */
5767  ANNOTATE_HAPPENS_BEFORE(thread);
5768  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5769  __kmp_release_64(&flag);
5770  }
5771 
5772  // Terminate OS thread.
5773  __kmp_reap_worker(thread);
5774 
5775  // The thread was killed asynchronously. If it was actively
5776  // spinning in the thread pool, decrement the global count.
5777  //
5778  // There is a small timing hole here - if the worker thread was just waking
5779  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5780  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5781  // the global counter might not get updated.
5782  //
5783  // Currently, this can only happen as the library is unloaded,
5784  // so there are no harmful side effects.
5785  if (thread->th.th_active_in_pool) {
5786  thread->th.th_active_in_pool = FALSE;
5787  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5788  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5789  }
5790 
5791  // Decrement # of [worker] threads in the pool.
5792  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5793  --__kmp_thread_pool_nth;
5794  }
5795 
5796  __kmp_free_implicit_task(thread);
5797 
5798 // Free the fast memory for tasking
5799 #if USE_FAST_MEMORY
5800  __kmp_free_fast_memory(thread);
5801 #endif /* USE_FAST_MEMORY */
5802 
5803  __kmp_suspend_uninitialize_thread(thread);
5804 
5805  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5806  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5807 
5808  --__kmp_all_nth;
5809 // __kmp_nth was decremented when thread is added to the pool.
5810 
5811 #ifdef KMP_ADJUST_BLOCKTIME
5812  /* Adjust blocktime back to user setting or default if necessary */
5813  /* Middle initialization might never have occurred */
5814  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5815  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5816  if (__kmp_nth <= __kmp_avail_proc) {
5817  __kmp_zero_bt = FALSE;
5818  }
5819  }
5820 #endif /* KMP_ADJUST_BLOCKTIME */
5821 
5822  /* free the memory being used */
5823  if (__kmp_env_consistency_check) {
5824  if (thread->th.th_cons) {
5825  __kmp_free_cons_stack(thread->th.th_cons);
5826  thread->th.th_cons = NULL;
5827  }
5828  }
5829 
5830  if (thread->th.th_pri_common != NULL) {
5831  __kmp_free(thread->th.th_pri_common);
5832  thread->th.th_pri_common = NULL;
5833  }
5834 
5835  if (thread->th.th_task_state_memo_stack != NULL) {
5836  __kmp_free(thread->th.th_task_state_memo_stack);
5837  thread->th.th_task_state_memo_stack = NULL;
5838  }
5839 
5840 #if KMP_USE_BGET
5841  if (thread->th.th_local.bget_data != NULL) {
5842  __kmp_finalize_bget(thread);
5843  }
5844 #endif
5845 
5846 #if KMP_AFFINITY_SUPPORTED
5847  if (thread->th.th_affin_mask != NULL) {
5848  KMP_CPU_FREE(thread->th.th_affin_mask);
5849  thread->th.th_affin_mask = NULL;
5850  }
5851 #endif /* KMP_AFFINITY_SUPPORTED */
5852 
5853 #if KMP_USE_HIER_SCHED
5854  if (thread->th.th_hier_bar_data != NULL) {
5855  __kmp_free(thread->th.th_hier_bar_data);
5856  thread->th.th_hier_bar_data = NULL;
5857  }
5858 #endif
5859 
5860  __kmp_reap_team(thread->th.th_serial_team);
5861  thread->th.th_serial_team = NULL;
5862  __kmp_free(thread);
5863 
5864  KMP_MB();
5865 
5866 } // __kmp_reap_thread
5867 
5868 static void __kmp_internal_end(void) {
5869  int i;
5870 
5871  /* First, unregister the library */
5872  __kmp_unregister_library();
5873 
5874 #if KMP_OS_WINDOWS
5875  /* In Win static library, we can't tell when a root actually dies, so we
5876  reclaim the data structures for any root threads that have died but not
5877  unregistered themselves, in order to shut down cleanly.
5878  In Win dynamic library we also can't tell when a thread dies. */
5879  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5880 // dead roots
5881 #endif
5882 
5883  for (i = 0; i < __kmp_threads_capacity; i++)
5884  if (__kmp_root[i])
5885  if (__kmp_root[i]->r.r_active)
5886  break;
5887  KMP_MB(); /* Flush all pending memory write invalidates. */
5888  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5889 
5890  if (i < __kmp_threads_capacity) {
5891 #if KMP_USE_MONITOR
5892  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5893  KMP_MB(); /* Flush all pending memory write invalidates. */
5894 
5895  // Need to check that monitor was initialized before reaping it. If we are
5896  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5897  // __kmp_monitor will appear to contain valid data, but it is only valid in
5898  // the parent process, not the child.
5899  // New behavior (201008): instead of keying off of the flag
5900  // __kmp_init_parallel, the monitor thread creation is keyed off
5901  // of the new flag __kmp_init_monitor.
5902  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5903  if (TCR_4(__kmp_init_monitor)) {
5904  __kmp_reap_monitor(&__kmp_monitor);
5905  TCW_4(__kmp_init_monitor, 0);
5906  }
5907  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5908  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5909 #endif // KMP_USE_MONITOR
5910  } else {
5911 /* TODO move this to cleanup code */
5912 #ifdef KMP_DEBUG
5913  /* make sure that everything has properly ended */
5914  for (i = 0; i < __kmp_threads_capacity; i++) {
5915  if (__kmp_root[i]) {
5916  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5917  // there can be uber threads alive here
5918  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5919  }
5920  }
5921 #endif
5922 
5923  KMP_MB();
5924 
5925  // Reap the worker threads.
5926  // This is valid for now, but be careful if threads are reaped sooner.
5927  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
5928  // Get the next thread from the pool.
5929  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5930  __kmp_thread_pool = thread->th.th_next_pool;
5931  // Reap it.
5932  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5933  thread->th.th_next_pool = NULL;
5934  thread->th.th_in_pool = FALSE;
5935  __kmp_reap_thread(thread, 0);
5936  }
5937  __kmp_thread_pool_insert_pt = NULL;
5938 
5939  // Reap teams.
5940  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
5941  // Get the next team from the pool.
5942  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5943  __kmp_team_pool = team->t.t_next_pool;
5944  // Reap it.
5945  team->t.t_next_pool = NULL;
5946  __kmp_reap_team(team);
5947  }
5948 
5949  __kmp_reap_task_teams();
5950 
5951 #if KMP_OS_UNIX
5952  // Threads that are not reaped should not access any resources since they
5953  // are going to be deallocated soon, so the shutdown sequence should wait
5954  // until all threads either exit the final spin-waiting loop or begin
5955  // sleeping after the given blocktime.
5956  for (i = 0; i < __kmp_threads_capacity; i++) {
5957  kmp_info_t *thr = __kmp_threads[i];
5958  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
5959  KMP_CPU_PAUSE();
5960  }
5961 #endif
5962 
5963  for (i = 0; i < __kmp_threads_capacity; ++i) {
5964  // TBD: Add some checking...
5965  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5966  }
5967 
5968  /* Make sure all threadprivate destructors get run by joining with all
5969  worker threads before resetting this flag */
5970  TCW_SYNC_4(__kmp_init_common, FALSE);
5971 
5972  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5973  KMP_MB();
5974 
5975 #if KMP_USE_MONITOR
5976  // See note above: One of the possible fixes for CQ138434 / CQ140126
5977  //
5978  // FIXME: push both code fragments down and CSE them?
5979  // push them into __kmp_cleanup() ?
5980  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5981  if (TCR_4(__kmp_init_monitor)) {
5982  __kmp_reap_monitor(&__kmp_monitor);
5983  TCW_4(__kmp_init_monitor, 0);
5984  }
5985  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5986  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5987 #endif
5988  } /* else !__kmp_global.t_active */
5989  TCW_4(__kmp_init_gtid, FALSE);
5990  KMP_MB(); /* Flush all pending memory write invalidates. */
5991 
5992  __kmp_cleanup();
5993 #if OMPT_SUPPORT
5994  ompt_fini();
5995 #endif
5996 }
5997 
5998 void __kmp_internal_end_library(int gtid_req) {
5999  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6000  /* this shouldn't be a race condition because __kmp_internal_end() is the
6001  only place to clear __kmp_serial_init */
6002  /* we'll check this later too, after we get the lock */
6003  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6004  // redundant, because the next check will work in any case.
6005  if (__kmp_global.g.g_abort) {
6006  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6007  /* TODO abort? */
6008  return;
6009  }
6010  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6011  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6012  return;
6013  }
6014 
6015  KMP_MB(); /* Flush all pending memory write invalidates. */
6016 
6017  /* find out who we are and what we should do */
6018  {
6019  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6020  KA_TRACE(
6021  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6022  if (gtid == KMP_GTID_SHUTDOWN) {
6023  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6024  "already shutdown\n"));
6025  return;
6026  } else if (gtid == KMP_GTID_MONITOR) {
6027  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6028  "registered, or system shutdown\n"));
6029  return;
6030  } else if (gtid == KMP_GTID_DNE) {
6031  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6032  "shutdown\n"));
6033  /* we don't know who we are, but we may still shutdown the library */
6034  } else if (KMP_UBER_GTID(gtid)) {
6035  /* unregister ourselves as an uber thread. gtid is no longer valid */
6036  if (__kmp_root[gtid]->r.r_active) {
6037  __kmp_global.g.g_abort = -1;
6038  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6039  KA_TRACE(10,
6040  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6041  gtid));
6042  return;
6043  } else {
6044  KA_TRACE(
6045  10,
6046  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6047  __kmp_unregister_root_current_thread(gtid);
6048  }
6049  } else {
6050 /* worker threads may call this function through the atexit handler, if they
6051  * call exit() */
6052 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6053  TODO: do a thorough shutdown instead */
6054 #ifdef DUMP_DEBUG_ON_EXIT
6055  if (__kmp_debug_buf)
6056  __kmp_dump_debug_buffer();
6057 #endif
6058  return;
6059  }
6060  }
6061  /* synchronize the termination process */
6062  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6063 
6064  /* have we already finished */
6065  if (__kmp_global.g.g_abort) {
6066  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6067  /* TODO abort? */
6068  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6069  return;
6070  }
6071  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6072  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6073  return;
6074  }
6075 
6076  /* We need this lock to enforce mutex between this reading of
6077  __kmp_threads_capacity and the writing by __kmp_register_root.
6078  Alternatively, we can use a counter of roots that is atomically updated by
6079  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6080  __kmp_internal_end_*. */
6081  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6082 
6083  /* now we can safely conduct the actual termination */
6084  __kmp_internal_end();
6085 
6086  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6087  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6088 
6089  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6090 
6091 #ifdef DUMP_DEBUG_ON_EXIT
6092  if (__kmp_debug_buf)
6093  __kmp_dump_debug_buffer();
6094 #endif
6095 
6096 #if KMP_OS_WINDOWS
6097  __kmp_close_console();
6098 #endif
6099 
6100  __kmp_fini_allocator();
6101 
6102 } // __kmp_internal_end_library
6103 
6104 void __kmp_internal_end_thread(int gtid_req) {
6105  int i;
6106 
6107  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6108  /* this shouldn't be a race condition because __kmp_internal_end() is the
6109  * only place to clear __kmp_serial_init */
6110  /* we'll check this later too, after we get the lock */
6111  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6112  // redundant, because the next check will work in any case.
6113  if (__kmp_global.g.g_abort) {
6114  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6115  /* TODO abort? */
6116  return;
6117  }
6118  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6119  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6120  return;
6121  }
6122 
6123  KMP_MB(); /* Flush all pending memory write invalidates. */
6124 
6125  /* find out who we are and what we should do */
6126  {
6127  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6128  KA_TRACE(10,
6129  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6130  if (gtid == KMP_GTID_SHUTDOWN) {
6131  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6132  "already shutdown\n"));
6133  return;
6134  } else if (gtid == KMP_GTID_MONITOR) {
6135  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6136  "registered, or system shutdown\n"));
6137  return;
6138  } else if (gtid == KMP_GTID_DNE) {
6139  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6140  "shutdown\n"));
6141  return;
6142  /* we don't know who we are */
6143  } else if (KMP_UBER_GTID(gtid)) {
6144  /* unregister ourselves as an uber thread. gtid is no longer valid */
6145  if (__kmp_root[gtid]->r.r_active) {
6146  __kmp_global.g.g_abort = -1;
6147  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6148  KA_TRACE(10,
6149  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6150  gtid));
6151  return;
6152  } else {
6153  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6154  gtid));
6155  __kmp_unregister_root_current_thread(gtid);
6156  }
6157  } else {
6158  /* just a worker thread, let's leave */
6159  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6160 
6161  if (gtid >= 0) {
6162  __kmp_threads[gtid]->th.th_task_team = NULL;
6163  }
6164 
6165  KA_TRACE(10,
6166  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6167  gtid));
6168  return;
6169  }
6170  }
6171 #if defined KMP_DYNAMIC_LIB
6172  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6173  // uber thread; it is better to shut down later, in the library destructor.
6174  // The reason for this change is a performance problem when a non-OpenMP
6175  // thread in a loop forks and joins many OpenMP threads. We can save a lot of
6176  // time by keeping the worker threads alive until program shutdown.
6177  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6178  // and Windows(DPD200287443) that occurs when using critical sections from
6179  // foreign threads.
6180  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6181  return;
6182 #endif
6183  /* synchronize the termination process */
6184  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6185 
6186  /* have we already finished */
6187  if (__kmp_global.g.g_abort) {
6188  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6189  /* TODO abort? */
6190  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6191  return;
6192  }
6193  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6194  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6195  return;
6196  }
6197 
6198  /* We need this lock to enforce mutex between this reading of
6199  __kmp_threads_capacity and the writing by __kmp_register_root.
6200  Alternatively, we can use a counter of roots that is atomically updated by
6201  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6202  __kmp_internal_end_*. */
6203 
6204  /* should we finish the run-time? are all siblings done? */
6205  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6206 
6207  for (i = 0; i < __kmp_threads_capacity; ++i) {
6208  if (KMP_UBER_GTID(i)) {
6209  KA_TRACE(
6210  10,
6211  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6212  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6213  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6214  return;
6215  }
6216  }
6217 
6218  /* now we can safely conduct the actual termination */
6219 
6220  __kmp_internal_end();
6221 
6222  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6223  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6224 
6225  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6226 
6227 #ifdef DUMP_DEBUG_ON_EXIT
6228  if (__kmp_debug_buf)
6229  __kmp_dump_debug_buffer();
6230 #endif
6231 } // __kmp_internal_end_thread
6232 
6233 // -----------------------------------------------------------------------------
6234 // Library registration stuff.
6235 
6236 static long __kmp_registration_flag = 0;
6237 // Random value used to indicate library initialization.
6238 static char *__kmp_registration_str = NULL;
6239 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6240 
6241 static inline char *__kmp_reg_status_name() {
6242  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6243  each thread. If registration and unregistration happen in different threads
6244  (omp_misc_other_root_exit.cpp test case), the registered_lib_env environment
6245  variable cannot be found, because its name will contain a different pid. */
6246  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6247 } // __kmp_reg_status_name
6248 
6249 void __kmp_register_library_startup(void) {
6250 
6251  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6252  int done = 0;
6253  union {
6254  double dtime;
6255  long ltime;
6256  } time;
6257 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6258  __kmp_initialize_system_tick();
6259 #endif
6260  __kmp_read_system_time(&time.dtime);
6261  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6262  __kmp_registration_str =
6263  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6264  __kmp_registration_flag, KMP_LIBRARY_FILE);
6265 
6266  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6267  __kmp_registration_str));
6268 
6269  while (!done) {
6270 
6271  char *value = NULL; // Actual value of the environment variable.
6272 
6273  // Set the environment variable, but do not overwrite it if it already exists.
6274  __kmp_env_set(name, __kmp_registration_str, 0);
6275  // Check that the variable was actually written.
6276  value = __kmp_env_get(name);
6277  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6278 
6279  done = 1; // Ok, environment variable set successfully, exit the loop.
6280 
6281  } else {
6282 
6283  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6284  // Check whether it is alive or dead.
6285  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6286  char *tail = value;
6287  char *flag_addr_str = NULL;
6288  char *flag_val_str = NULL;
6289  char const *file_name = NULL;
6290  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6291  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6292  file_name = tail;
6293  if (tail != NULL) {
6294  long *flag_addr = 0;
6295  long flag_val = 0;
6296  KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6297  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6298  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6299  // First, check whether the environment-encoded address is mapped into
6300  // our address space.
6301  // If so, dereference it to see if it still has the right value.
6302  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6303  neighbor = 1;
6304  } else {
6305  // If not, then we know the other copy of the library is no longer
6306  // running.
6307  neighbor = 2;
6308  }
6309  }
6310  }
6311  switch (neighbor) {
6312  case 0: // Cannot parse environment variable -- neighbor status unknown.
6313  // Assume it is the incompatible format of a future version of the
6314  // library, and assume the other library is alive.
6315  // WARN( ... ); // TODO: Issue a warning.
6316  file_name = "unknown library";
6317  // Attention! Falling through to the next case. That's intentional.
6318  case 1: { // Neighbor is alive.
6319  // Check whether a duplicate library is allowed.
6320  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6321  if (!__kmp_str_match_true(duplicate_ok)) {
6322  // That's not allowed. Issue a fatal error.
6323  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6324  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6325  }
6326  KMP_INTERNAL_FREE(duplicate_ok);
6327  __kmp_duplicate_library_ok = 1;
6328  done = 1; // Exit the loop.
6329  } break;
6330  case 2: { // Neighbor is dead.
6331  // Clear the variable and try to register the library again.
6332  __kmp_env_unset(name);
6333  } break;
6334  default: { KMP_DEBUG_ASSERT(0); } break;
6335  }
6336  }
6337  KMP_INTERNAL_FREE((void *)value);
6338  }
6339  KMP_INTERNAL_FREE((void *)name);
6340 
6341 } // func __kmp_register_library_startup
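
// --- Editor's illustrative sketch; not part of kmp_runtime.cpp ---------------
// A minimal, standalone model of the registration handshake above, assuming
// only the C standard library: the value stored in __KMP_REGISTERED_LIB_<pid>
// is "%p-%lx-%s" (flag address, flag value, library file), and a neighbor is
// considered alive only if the encoded address is still mapped and still holds
// the encoded value. The demo_ names are hypothetical, and the scanf "%p" usage
// mirrors the KMP_SSCANF calls above; a real check would also verify that the
// address is mapped, as __kmp_is_address_mapped() does.
#include <stdio.h>

static unsigned long demo_registration_flag = 0xCAFE0000UL;

// Encode this copy's registration string into buf (cf. __kmp_str_format above).
static void demo_encode(char *buf, size_t len) {
  snprintf(buf, len, "%p-%lx-%s", (void *)&demo_registration_flag,
           demo_registration_flag, "libdemo.so");
}

// Parse a neighbor's string and report whether it looks alive (1) or not (0).
static int demo_neighbor_alive(const char *value) {
  void *flag_addr = NULL;
  unsigned long flag_val = 0;
  char file_name[128] = "";
  if (sscanf(value, "%p-%lx-%127s", &flag_addr, &flag_val, file_name) != 3)
    return 0; // unparsable: status unknown, treat conservatively
  return flag_addr == (void *)&demo_registration_flag &&
         *(unsigned long *)flag_addr == flag_val && file_name[0] != '\0';
}
// --- end sketch ---------------------------------------------------------------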
6342 
6343 void __kmp_unregister_library(void) {
6344 
6345  char *name = __kmp_reg_status_name();
6346  char *value = __kmp_env_get(name);
6347 
6348  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6349  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6350  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6351  // Ok, this is our variable. Delete it.
6352  __kmp_env_unset(name);
6353  }
6354 
6355  KMP_INTERNAL_FREE(__kmp_registration_str);
6356  KMP_INTERNAL_FREE(value);
6357  KMP_INTERNAL_FREE(name);
6358 
6359  __kmp_registration_flag = 0;
6360  __kmp_registration_str = NULL;
6361 
6362 } // __kmp_unregister_library
6363 
6364 // End of Library registration stuff.
6365 // -----------------------------------------------------------------------------
6366 
6367 #if KMP_MIC_SUPPORTED
6368 
6369 static void __kmp_check_mic_type() {
6370  kmp_cpuid_t cpuid_state = {0};
6371  kmp_cpuid_t *cs_p = &cpuid_state;
6372  __kmp_x86_cpuid(1, 0, cs_p);
6373  // We don't support mic1 at the moment
6374  if ((cs_p->eax & 0xff0) == 0xB10) {
6375  __kmp_mic_type = mic2;
6376  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6377  __kmp_mic_type = mic3;
6378  } else {
6379  __kmp_mic_type = non_mic;
6380  }
6381 }
6382 
6383 #endif /* KMP_MIC_SUPPORTED */
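
// --- Editor's illustrative sketch; not part of kmp_runtime.cpp ---------------
// The detection above keys off CPUID leaf 1 EAX (family/model/stepping bits):
// (eax & 0xff0) == 0xB10 matches the signature the runtime treats as mic2
// (Knights Corner), and (eax & 0xf0ff0) == 0x50670 matches mic3 (Knights
// Landing). This standalone helper applies the same masks to an EAX value
// supplied by the caller, so the classification can be exercised without
// executing CPUID. The demo_ names are hypothetical.
typedef enum { demo_non_mic, demo_mic2_knc, demo_mic3_knl } demo_mic_kind_t;

static demo_mic_kind_t demo_classify_mic(unsigned int eax) {
  if ((eax & 0xff0) == 0xB10)
    return demo_mic2_knc; // same test as the mic2 branch above
  if ((eax & 0xf0ff0) == 0x50670)
    return demo_mic3_knl; // same test as the mic3 branch above
  return demo_non_mic;
}
// --- end sketch ---------------------------------------------------------------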
6384 
6385 static void __kmp_do_serial_initialize(void) {
6386  int i, gtid;
6387  int size;
6388 
6389  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6390 
6391  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6392  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6393  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6394  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6395  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6396 
6397 #if OMPT_SUPPORT
6398  ompt_pre_init();
6399 #endif
6400 
6401  __kmp_validate_locks();
6402 
6403  /* Initialize internal memory allocator */
6404  __kmp_init_allocator();
6405 
6406  /* Register the library startup via an environment variable and check to see
6407  whether another copy of the library is already registered. */
6408 
6409  __kmp_register_library_startup();
6410 
6411  /* TODO reinitialization of library */
6412  if (TCR_4(__kmp_global.g.g_done)) {
6413  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6414  }
6415 
6416  __kmp_global.g.g_abort = 0;
6417  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6418 
6419 /* initialize the locks */
6420 #if KMP_USE_ADAPTIVE_LOCKS
6421 #if KMP_DEBUG_ADAPTIVE_LOCKS
6422  __kmp_init_speculative_stats();
6423 #endif
6424 #endif
6425 #if KMP_STATS_ENABLED
6426  __kmp_stats_init();
6427 #endif
6428  __kmp_init_lock(&__kmp_global_lock);
6429  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6430  __kmp_init_lock(&__kmp_debug_lock);
6431  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6432  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6433  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6434  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6435  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6436  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6437  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6438  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6439  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6440  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6441  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6442  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6443  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6444  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6445  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6446 #if KMP_USE_MONITOR
6447  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6448 #endif
6449  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6450 
6451  /* conduct initialization and initial setup of configuration */
6452 
6453  __kmp_runtime_initialize();
6454 
6455 #if KMP_MIC_SUPPORTED
6456  __kmp_check_mic_type();
6457 #endif
6458 
6459 // Some global variable initialization moved here from kmp_env_initialize()
6460 #ifdef KMP_DEBUG
6461  kmp_diag = 0;
6462 #endif
6463  __kmp_abort_delay = 0;
6464 
6465  // From __kmp_init_dflt_team_nth()
6466  /* assume the entire machine will be used */
6467  __kmp_dflt_team_nth_ub = __kmp_xproc;
6468  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6469  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6470  }
6471  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6472  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6473  }
6474  __kmp_max_nth = __kmp_sys_max_nth;
6475  __kmp_cg_max_nth = __kmp_sys_max_nth;
6476  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6477  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6478  __kmp_teams_max_nth = __kmp_sys_max_nth;
6479  }
6480 
6481  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6482  // part
6483  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6484 #if KMP_USE_MONITOR
6485  __kmp_monitor_wakeups =
6486  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6487  __kmp_bt_intervals =
6488  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6489 #endif
6490  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6491  __kmp_library = library_throughput;
6492  // From KMP_SCHEDULE initialization
6493  __kmp_static = kmp_sch_static_balanced;
6494 // AC: do not use analytical here, because it is non-monotonous
6495 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6496 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6497 // need to repeat assignment
6498 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6499 // bit control and barrier method control parts
6500 #if KMP_FAST_REDUCTION_BARRIER
6501 #define kmp_reduction_barrier_gather_bb ((int)1)
6502 #define kmp_reduction_barrier_release_bb ((int)1)
6503 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6504 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6505 #endif // KMP_FAST_REDUCTION_BARRIER
6506  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6507  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6508  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6509  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6510  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6511 #if KMP_FAST_REDUCTION_BARRIER
6512  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6513  // lin_64 ): hyper,1
6514  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6515  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6516  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6517  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6518  }
6519 #endif // KMP_FAST_REDUCTION_BARRIER
6520  }
6521 #if KMP_FAST_REDUCTION_BARRIER
6522 #undef kmp_reduction_barrier_release_pat
6523 #undef kmp_reduction_barrier_gather_pat
6524 #undef kmp_reduction_barrier_release_bb
6525 #undef kmp_reduction_barrier_gather_bb
6526 #endif // KMP_FAST_REDUCTION_BARRIER
6527 #if KMP_MIC_SUPPORTED
6528  if (__kmp_mic_type == mic2) { // KNC
6529  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6530  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6531  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6532  1; // forkjoin release
6533  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6534  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6535  }
6536 #if KMP_FAST_REDUCTION_BARRIER
6537  if (__kmp_mic_type == mic2) { // KNC
6538  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6539  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6540  }
6541 #endif // KMP_FAST_REDUCTION_BARRIER
6542 #endif // KMP_MIC_SUPPORTED
6543 
6544 // From KMP_CHECKS initialization
6545 #ifdef KMP_DEBUG
6546  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6547 #else
6548  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6549 #endif
6550 
6551  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6552  __kmp_foreign_tp = TRUE;
6553 
6554  __kmp_global.g.g_dynamic = FALSE;
6555  __kmp_global.g.g_dynamic_mode = dynamic_default;
6556 
6557  __kmp_env_initialize(NULL);
6558 
6559 // Print all messages in message catalog for testing purposes.
6560 #ifdef KMP_DEBUG
6561  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6562  if (__kmp_str_match_true(val)) {
6563  kmp_str_buf_t buffer;
6564  __kmp_str_buf_init(&buffer);
6565  __kmp_i18n_dump_catalog(&buffer);
6566  __kmp_printf("%s", buffer.str);
6567  __kmp_str_buf_free(&buffer);
6568  }
6569  __kmp_env_free(&val);
6570 #endif
6571 
6572  __kmp_threads_capacity =
6573  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6574  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6575  __kmp_tp_capacity = __kmp_default_tp_capacity(
6576  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6577 
6578  // If the library is shut down properly, both pools must be NULL. Just in
6579  // case, set them to NULL -- some memory may leak, but subsequent code will
6580  // work even if pools are not freed.
6581  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6582  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6583  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6584  __kmp_thread_pool = NULL;
6585  __kmp_thread_pool_insert_pt = NULL;
6586  __kmp_team_pool = NULL;
6587 
6588  /* Allocate all of the variable sized records */
6589  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6590  * expandable */
6591  /* Since allocation is cache-aligned, just add extra padding at the end */
6592  size =
6593  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6594  CACHE_LINE;
6595  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6596  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6597  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6598 
6599  /* init thread counts */
6600  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6601  0); // Asserts fail if the library is reinitializing and
6602  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6603  __kmp_all_nth = 0;
6604  __kmp_nth = 0;
6605 
6606  /* setup the uber master thread and hierarchy */
6607  gtid = __kmp_register_root(TRUE);
6608  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6609  KMP_ASSERT(KMP_UBER_GTID(gtid));
6610  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6611 
6612  KMP_MB(); /* Flush all pending memory write invalidates. */
6613 
6614  __kmp_common_initialize();
6615 
6616 #if KMP_OS_UNIX
6617  /* invoke the child fork handler */
6618  __kmp_register_atfork();
6619 #endif
6620 
6621 #if !defined KMP_DYNAMIC_LIB
6622  {
6623  /* Invoke the exit handler when the program finishes, only for static
6624  library. For dynamic library, we already have _fini and DllMain. */
6625  int rc = atexit(__kmp_internal_end_atexit);
6626  if (rc != 0) {
6627  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6628  __kmp_msg_null);
6629  }
6630  }
6631 #endif
6632 
6633 #if KMP_HANDLE_SIGNALS
6634 #if KMP_OS_UNIX
6635  /* NOTE: make sure that this is called before the user installs their own
6636  signal handlers so that the user handlers are called first. this way they
6637  can return false, not call our handler, avoid terminating the library, and
6638  continue execution where they left off. */
6639  __kmp_install_signals(FALSE);
6640 #endif /* KMP_OS_UNIX */
6641 #if KMP_OS_WINDOWS
6642  __kmp_install_signals(TRUE);
6643 #endif /* KMP_OS_WINDOWS */
6644 #endif
6645 
6646  /* we have finished the serial initialization */
6647  __kmp_init_counter++;
6648 
6649  __kmp_init_serial = TRUE;
6650 
6651  if (__kmp_settings) {
6652  __kmp_env_print();
6653  }
6654 
6655 #if OMP_40_ENABLED
6656  if (__kmp_display_env || __kmp_display_env_verbose) {
6657  __kmp_env_print_2();
6658  }
6659 #endif // OMP_40_ENABLED
6660 
6661 #if OMPT_SUPPORT
6662  ompt_post_init();
6663 #endif
6664 
6665  KMP_MB();
6666 
6667  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6668 }
6669 
6670 void __kmp_serial_initialize(void) {
6671  if (__kmp_init_serial) {
6672  return;
6673  }
6674  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6675  if (__kmp_init_serial) {
6676  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6677  return;
6678  }
6679  __kmp_do_serial_initialize();
6680  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6681 }
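
// --- Editor's illustrative sketch; not part of kmp_runtime.cpp ---------------
// __kmp_serial_initialize, __kmp_middle_initialize and __kmp_parallel_initialize
// all follow the same check / lock / re-check shape, so concurrent callers
// initialize at most once and the fast path takes no lock. A generic
// restatement with C11 atomics and a pthread mutex (the runtime itself uses
// its TCR_4/TCW_SYNC_4 macros and bootstrap locks instead):
#include <stdatomic.h>
#include <pthread.h>

static atomic_int demo_initialized;                // 0 until first init completes
static pthread_mutex_t demo_initz_lock = PTHREAD_MUTEX_INITIALIZER;

static void demo_do_initialize(void) { /* expensive one-time setup */ }

static void demo_initialize_once(void) {
  if (atomic_load(&demo_initialized))              // fast path, no lock
    return;
  pthread_mutex_lock(&demo_initz_lock);
  if (!atomic_load(&demo_initialized)) {           // re-check under the lock
    demo_do_initialize();
    atomic_store(&demo_initialized, 1);
  }
  pthread_mutex_unlock(&demo_initz_lock);
}
// --- end sketch ---------------------------------------------------------------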
6682 
6683 static void __kmp_do_middle_initialize(void) {
6684  int i, j;
6685  int prev_dflt_team_nth;
6686 
6687  if (!__kmp_init_serial) {
6688  __kmp_do_serial_initialize();
6689  }
6690 
6691  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6692 
6693  // Save the previous value for the __kmp_dflt_team_nth so that
6694  // we can avoid some reinitialization if it hasn't changed.
6695  prev_dflt_team_nth = __kmp_dflt_team_nth;
6696 
6697 #if KMP_AFFINITY_SUPPORTED
6698  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6699  // number of cores on the machine.
6700  __kmp_affinity_initialize();
6701 
6702  // Run through the __kmp_threads array and set the affinity mask
6703  // for each root thread that is currently registered with the RTL.
6704  for (i = 0; i < __kmp_threads_capacity; i++) {
6705  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6706  __kmp_affinity_set_init_mask(i, TRUE);
6707  }
6708  }
6709 #endif /* KMP_AFFINITY_SUPPORTED */
6710 
6711  KMP_ASSERT(__kmp_xproc > 0);
6712  if (__kmp_avail_proc == 0) {
6713  __kmp_avail_proc = __kmp_xproc;
6714  }
6715 
6716  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6717  // correct them now
6718  j = 0;
6719  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6720  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6721  __kmp_avail_proc;
6722  j++;
6723  }
6724 
6725  if (__kmp_dflt_team_nth == 0) {
6726 #ifdef KMP_DFLT_NTH_CORES
6727  // Default #threads = #cores
6728  __kmp_dflt_team_nth = __kmp_ncores;
6729  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6730  "__kmp_ncores (%d)\n",
6731  __kmp_dflt_team_nth));
6732 #else
6733  // Default #threads = #available OS procs
6734  __kmp_dflt_team_nth = __kmp_avail_proc;
6735  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6736  "__kmp_avail_proc(%d)\n",
6737  __kmp_dflt_team_nth));
6738 #endif /* KMP_DFLT_NTH_CORES */
6739  }
6740 
6741  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6742  __kmp_dflt_team_nth = KMP_MIN_NTH;
6743  }
6744  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6745  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6746  }
6747 
6748  // There's no harm in continuing if the following check fails,
6749  // but it indicates an error in the previous logic.
6750  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6751 
6752  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6753  // Run through the __kmp_threads array and set the num threads icv for each
6754  // root thread that is currently registered with the RTL (which has not
6755  // already explicitly set its nthreads-var with a call to
6756  // omp_set_num_threads()).
6757  for (i = 0; i < __kmp_threads_capacity; i++) {
6758  kmp_info_t *thread = __kmp_threads[i];
6759  if (thread == NULL)
6760  continue;
6761  if (thread->th.th_current_task->td_icvs.nproc != 0)
6762  continue;
6763 
6764  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6765  }
6766  }
6767  KA_TRACE(
6768  20,
6769  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6770  __kmp_dflt_team_nth));
6771 
6772 #ifdef KMP_ADJUST_BLOCKTIME
6773  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6774  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6775  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6776  if (__kmp_nth > __kmp_avail_proc) {
6777  __kmp_zero_bt = TRUE;
6778  }
6779  }
6780 #endif /* KMP_ADJUST_BLOCKTIME */
6781 
6782  /* we have finished middle initialization */
6783  TCW_SYNC_4(__kmp_init_middle, TRUE);
6784 
6785  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6786 }
6787 
6788 void __kmp_middle_initialize(void) {
6789  if (__kmp_init_middle) {
6790  return;
6791  }
6792  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6793  if (__kmp_init_middle) {
6794  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6795  return;
6796  }
6797  __kmp_do_middle_initialize();
6798  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6799 }
6800 
6801 void __kmp_parallel_initialize(void) {
6802  int gtid = __kmp_entry_gtid(); // this might be a new root
6803 
6804  /* synchronize parallel initialization (for sibling) */
6805  if (TCR_4(__kmp_init_parallel))
6806  return;
6807  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6808  if (TCR_4(__kmp_init_parallel)) {
6809  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6810  return;
6811  }
6812 
6813  /* TODO reinitialization after we have already shut down */
6814  if (TCR_4(__kmp_global.g.g_done)) {
6815  KA_TRACE(
6816  10,
6817  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6818  __kmp_infinite_loop();
6819  }
6820 
6821  /* jc: The lock __kmp_initz_lock is already held, so calling
6822  __kmp_serial_initialize would cause a deadlock. So we call
6823  __kmp_do_serial_initialize directly. */
6824  if (!__kmp_init_middle) {
6825  __kmp_do_middle_initialize();
6826  }
6827 
6828  /* begin initialization */
6829  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6830  KMP_ASSERT(KMP_UBER_GTID(gtid));
6831 
6832 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6833  // Save the FP control regs.
6834  // Worker threads will set theirs to these values at thread startup.
6835  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6836  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6837  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6838 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6839 
6840 #if KMP_OS_UNIX
6841 #if KMP_HANDLE_SIGNALS
6842  /* must be after __kmp_serial_initialize */
6843  __kmp_install_signals(TRUE);
6844 #endif
6845 #endif
6846 
6847  __kmp_suspend_initialize();
6848 
6849 #if defined(USE_LOAD_BALANCE)
6850  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6851  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6852  }
6853 #else
6854  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6855  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6856  }
6857 #endif
6858 
6859  if (__kmp_version) {
6860  __kmp_print_version_2();
6861  }
6862 
6863  /* we have finished parallel initialization */
6864  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6865 
6866  KMP_MB();
6867  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6868 
6869  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6870 }
6871 
6872 /* ------------------------------------------------------------------------ */
6873 
6874 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6875  kmp_team_t *team) {
6876  kmp_disp_t *dispatch;
6877 
6878  KMP_MB();
6879 
6880  /* none of the threads have encountered any constructs, yet. */
6881  this_thr->th.th_local.this_construct = 0;
6882 #if KMP_CACHE_MANAGE
6883  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6884 #endif /* KMP_CACHE_MANAGE */
6885  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6886  KMP_DEBUG_ASSERT(dispatch);
6887  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6888  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6889  // this_thr->th.th_info.ds.ds_tid ] );
6890 
6891  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6892 #if OMP_45_ENABLED
6893  dispatch->th_doacross_buf_idx =
6894  0; /* reset the doacross dispatch buffer counter */
6895 #endif
6896  if (__kmp_env_consistency_check)
6897  __kmp_push_parallel(gtid, team->t.t_ident);
6898 
6899  KMP_MB(); /* Flush all pending memory write invalidates. */
6900 }
6901 
6902 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6903  kmp_team_t *team) {
6904  if (__kmp_env_consistency_check)
6905  __kmp_pop_parallel(gtid, team->t.t_ident);
6906 
6907  __kmp_finish_implicit_task(this_thr);
6908 }
6909 
6910 int __kmp_invoke_task_func(int gtid) {
6911  int rc;
6912  int tid = __kmp_tid_from_gtid(gtid);
6913  kmp_info_t *this_thr = __kmp_threads[gtid];
6914  kmp_team_t *team = this_thr->th.th_team;
6915 
6916  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6917 #if USE_ITT_BUILD
6918  if (__itt_stack_caller_create_ptr) {
6919  __kmp_itt_stack_callee_enter(
6920  (__itt_caller)
6921  team->t.t_stack_id); // inform ittnotify about entering user's code
6922  }
6923 #endif /* USE_ITT_BUILD */
6924 #if INCLUDE_SSC_MARKS
6925  SSC_MARK_INVOKING();
6926 #endif
6927 
6928 #if OMPT_SUPPORT
6929  void *dummy;
6930  void **exit_runtime_p;
6931  ompt_data_t *my_task_data;
6932  ompt_data_t *my_parallel_data;
6933  int ompt_team_size;
6934 
6935  if (ompt_enabled.enabled) {
6936  exit_runtime_p = &(
6937  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6938  } else {
6939  exit_runtime_p = &dummy;
6940  }
6941 
6942  my_task_data =
6943  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6944  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6945  if (ompt_enabled.ompt_callback_implicit_task) {
6946  ompt_team_size = team->t.t_nproc;
6947  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6948  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6949  __kmp_tid_from_gtid(gtid));
6950  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6951  }
6952 #endif
6953 
6954  {
6955  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6956  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6957  rc =
6958  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6959  tid, (int)team->t.t_argc, (void **)team->t.t_argv
6960 #if OMPT_SUPPORT
6961  ,
6962  exit_runtime_p
6963 #endif
6964  );
6965 #if OMPT_SUPPORT
6966  *exit_runtime_p = NULL;
6967 #endif
6968  }
6969 
6970 #if USE_ITT_BUILD
6971  if (__itt_stack_caller_create_ptr) {
6972  __kmp_itt_stack_callee_leave(
6973  (__itt_caller)
6974  team->t.t_stack_id); // inform ittnotify about leaving user's code
6975  }
6976 #endif /* USE_ITT_BUILD */
6977  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6978 
6979  return rc;
6980 }
6981 
6982 #if OMP_40_ENABLED
6983 void __kmp_teams_master(int gtid) {
6984  // This routine is called by all master threads in teams construct
6985  kmp_info_t *thr = __kmp_threads[gtid];
6986  kmp_team_t *team = thr->th.th_team;
6987  ident_t *loc = team->t.t_ident;
6988  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6989  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6990  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6991  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6992  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
6993 // Launch the league of teams now, but do not let workers execute
6994 // (they hang on the fork barrier until the next parallel region)
6995 #if INCLUDE_SSC_MARKS
6996  SSC_MARK_FORKING();
6997 #endif
6998  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6999  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7000  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7001 #if INCLUDE_SSC_MARKS
7002  SSC_MARK_JOINING();
7003 #endif
7004 
7005  // AC: the last parameter "1" eliminates the join barrier, which won't work
7006  // because worker threads are in a fork barrier waiting for more parallel regions
7007  __kmp_join_call(loc, gtid
7008 #if OMPT_SUPPORT
7009  ,
7010  fork_context_intel
7011 #endif
7012  ,
7013  1);
7014 }
7015 
7016 int __kmp_invoke_teams_master(int gtid) {
7017  kmp_info_t *this_thr = __kmp_threads[gtid];
7018  kmp_team_t *team = this_thr->th.th_team;
7019 #if KMP_DEBUG
7020  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7021  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7022  (void *)__kmp_teams_master);
7023 #endif
7024  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7025  __kmp_teams_master(gtid);
7026  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7027  return 1;
7028 }
7029 #endif /* OMP_40_ENABLED */
7030 
7031 /* This sets the requested number of threads for the next parallel region
7032  encountered by this team. Since this should be enclosed in the fork/join
7033  critical section, it should avoid race conditions with asymmetrical nested
7034  parallelism. */
7035 
7036 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7037  kmp_info_t *thr = __kmp_threads[gtid];
7038 
7039  if (num_threads > 0)
7040  thr->th.th_set_nproc = num_threads;
7041 }
7042 
7043 #if OMP_40_ENABLED
7044 
7045 /* this sets the requested number of teams for the teams region and/or
7046  the number of threads for the next parallel region encountered */
7047 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7048  int num_threads) {
7049  kmp_info_t *thr = __kmp_threads[gtid];
7050  KMP_DEBUG_ASSERT(num_teams >= 0);
7051  KMP_DEBUG_ASSERT(num_threads >= 0);
7052 
7053  if (num_teams == 0)
7054  num_teams = 1; // default number of teams is 1.
7055  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7056  if (!__kmp_reserve_warn) {
7057  __kmp_reserve_warn = 1;
7058  __kmp_msg(kmp_ms_warning,
7059  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7060  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7061  }
7062  num_teams = __kmp_teams_max_nth;
7063  }
7064  // Set number of teams (number of threads in the outer "parallel" of the
7065  // teams)
7066  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7067 
7068  // Remember the number of threads for inner parallel regions
7069  if (num_threads == 0) {
7070  if (!TCR_4(__kmp_init_middle))
7071  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7072  num_threads = __kmp_avail_proc / num_teams;
7073  if (num_teams * num_threads > __kmp_teams_max_nth) {
7074  // adjust num_threads without a warning, as it is not a user setting
7075  num_threads = __kmp_teams_max_nth / num_teams;
7076  }
7077  } else {
7078  if (num_teams * num_threads > __kmp_teams_max_nth) {
7079  int new_threads = __kmp_teams_max_nth / num_teams;
7080  if (!__kmp_reserve_warn) { // user asked for too many threads
7081  __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7082  __kmp_msg(kmp_ms_warning,
7083  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7084  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7085  }
7086  num_threads = new_threads;
7087  }
7088  }
7089  thr->th.th_teams_size.nth = num_threads;
7090 }
7091 
7092 // Set the proc_bind var to use in the following parallel region.
7093 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7094  kmp_info_t *thr = __kmp_threads[gtid];
7095  thr->th.th_set_proc_bind = proc_bind;
7096 }
7097 
7098 #endif /* OMP_40_ENABLED */
7099 
7100 /* Launch the worker threads into the microtask. */
7101 
7102 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7103  kmp_info_t *this_thr = __kmp_threads[gtid];
7104 
7105 #ifdef KMP_DEBUG
7106  int f;
7107 #endif /* KMP_DEBUG */
7108 
7109  KMP_DEBUG_ASSERT(team);
7110  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7111  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7112  KMP_MB(); /* Flush all pending memory write invalidates. */
7113 
7114  team->t.t_construct = 0; /* no single directives seen yet */
7115  team->t.t_ordered.dt.t_value =
7116  0; /* thread 0 enters the ordered section first */
7117 
7118  /* Reset the identifiers on the dispatch buffer */
7119  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7120  if (team->t.t_max_nproc > 1) {
7121  int i;
7122  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7123  team->t.t_disp_buffer[i].buffer_index = i;
7124 #if OMP_45_ENABLED
7125  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7126 #endif
7127  }
7128  } else {
7129  team->t.t_disp_buffer[0].buffer_index = 0;
7130 #if OMP_45_ENABLED
7131  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7132 #endif
7133  }
7134 
7135  KMP_MB(); /* Flush all pending memory write invalidates. */
7136  KMP_ASSERT(this_thr->th.th_team == team);
7137 
7138 #ifdef KMP_DEBUG
7139  for (f = 0; f < team->t.t_nproc; f++) {
7140  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7141  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7142  }
7143 #endif /* KMP_DEBUG */
7144 
7145  /* release the worker threads so they may begin working */
7146  __kmp_fork_barrier(gtid, 0);
7147 }
7148 
7149 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7150  kmp_info_t *this_thr = __kmp_threads[gtid];
7151 
7152  KMP_DEBUG_ASSERT(team);
7153  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7154  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7155  KMP_MB(); /* Flush all pending memory write invalidates. */
7156 
7157 /* Join barrier after fork */
7158 
7159 #ifdef KMP_DEBUG
7160  if (__kmp_threads[gtid] &&
7161  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7162  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7163  __kmp_threads[gtid]);
7164  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7165  "team->t.t_nproc=%d\n",
7166  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7167  team->t.t_nproc);
7168  __kmp_print_structure();
7169  }
7170  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7171  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7172 #endif /* KMP_DEBUG */
7173 
7174  __kmp_join_barrier(gtid); /* wait for everyone */
7175 #if OMPT_SUPPORT
7176  if (ompt_enabled.enabled &&
7177  this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7178  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7179  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7180  this_thr->th.ompt_thread_info.state = omp_state_overhead;
7181 #if OMPT_OPTIONAL
7182  void *codeptr = NULL;
7183  if (KMP_MASTER_TID(ds_tid) &&
7184  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7185  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7186  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7187 
7188  if (ompt_enabled.ompt_callback_sync_region_wait) {
7189  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7190  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7191  }
7192  if (ompt_enabled.ompt_callback_sync_region) {
7193  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7194  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7195  }
7196 #endif
7197  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7198  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7199  ompt_scope_end, NULL, task_data, 0, ds_tid);
7200  }
7201  }
7202 #endif
7203 
7204  KMP_MB(); /* Flush all pending memory write invalidates. */
7205  KMP_ASSERT(this_thr->th.th_team == team);
7206 }
7207 
7208 /* ------------------------------------------------------------------------ */
7209 
7210 #ifdef USE_LOAD_BALANCE
7211 
7212 // Return the number of worker threads actively spinning in the hot team,
7213 // if we are at the outermost level of parallelism. Otherwise, return 0.
7214 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7215  int i;
7216  int retval;
7217  kmp_team_t *hot_team;
7218 
7219  if (root->r.r_active) {
7220  return 0;
7221  }
7222  hot_team = root->r.r_hot_team;
7223  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7224  return hot_team->t.t_nproc - 1; // Don't count master thread
7225  }
7226 
7227  // Skip the master thread - it is accounted for elsewhere.
7228  retval = 0;
7229  for (i = 1; i < hot_team->t.t_nproc; i++) {
7230  if (hot_team->t.t_threads[i]->th.th_active) {
7231  retval++;
7232  }
7233  }
7234  return retval;
7235 }
7236 
7237 // Perform an automatic adjustment to the number of
7238 // threads used by the next parallel region.
7239 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7240  int retval;
7241  int pool_active;
7242  int hot_team_active;
7243  int team_curr_active;
7244  int system_active;
7245 
7246  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7247  set_nproc));
7248  KMP_DEBUG_ASSERT(root);
7249  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7250  ->th.th_current_task->td_icvs.dynamic == TRUE);
7251  KMP_DEBUG_ASSERT(set_nproc > 1);
7252 
7253  if (set_nproc == 1) {
7254  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7255  return 1;
7256  }
7257 
7258  // Threads that are active in the thread pool, active in the hot team for this
7259  // particular root (if we are at the outer par level), and the currently
7260  // executing thread (to become the master) are available to add to the new
7261  // team, but are currently contributing to the system load, and must be
7262  // accounted for.
7263  pool_active = __kmp_thread_pool_active_nth;
7264  hot_team_active = __kmp_active_hot_team_nproc(root);
7265  team_curr_active = pool_active + hot_team_active + 1;
7266 
7267  // Check the system load.
7268  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7269  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7270  "hot team active = %d\n",
7271  system_active, pool_active, hot_team_active));
7272 
7273  if (system_active < 0) {
7274  // There was an error reading the necessary info from /proc, so use the
7275  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7276  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7277  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7278  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7279 
7280  // Make this call behave like the thread limit algorithm.
7281  retval = __kmp_avail_proc - __kmp_nth +
7282  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7283  if (retval > set_nproc) {
7284  retval = set_nproc;
7285  }
7286  if (retval < KMP_MIN_NTH) {
7287  retval = KMP_MIN_NTH;
7288  }
7289 
7290  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7291  retval));
7292  return retval;
7293  }
7294 
7295  // There is a slight delay in the load balance algorithm in detecting new
7296  // running procs. The real system load at this instant should be at least as
7297  // large as the number of active OpenMP threads available to add to the team.
7298  if (system_active < team_curr_active) {
7299  system_active = team_curr_active;
7300  }
7301  retval = __kmp_avail_proc - system_active + team_curr_active;
7302  if (retval > set_nproc) {
7303  retval = set_nproc;
7304  }
7305  if (retval < KMP_MIN_NTH) {
7306  retval = KMP_MIN_NTH;
7307  }
7308 
7309  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7310  return retval;
7311 } // __kmp_load_balance_nproc()
7312 
7313 #endif /* USE_LOAD_BALANCE */
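
// --- Editor's illustrative sketch; not part of kmp_runtime.cpp ---------------
// Once system_active has been raised to at least team_curr_active, the core of
// __kmp_load_balance_nproc above reduces to
//     retval = clamp(avail_proc - system_active + team_curr_active,
//                    KMP_MIN_NTH, set_nproc).
// For example, with 16 available procs, 10 runnable threads system-wide, 3 of
// them already available to this team, and a request for 8 threads:
// 16 - 10 + 3 = 9, clamped to the requested 8. The helper below restates the
// arithmetic on plain ints, with KMP_MIN_NTH assumed to be 1.
static int demo_load_balance_nproc(int avail_proc, int system_active,
                                   int team_curr_active, int set_nproc) {
  const int min_nth = 1; // stand-in for KMP_MIN_NTH
  int retval;
  if (system_active < team_curr_active)
    system_active = team_curr_active; // tolerate stale /proc data
  retval = avail_proc - system_active + team_curr_active;
  if (retval > set_nproc)
    retval = set_nproc; // never exceed the request
  if (retval < min_nth)
    retval = min_nth; // never go below the minimum team size
  return retval;
}
// --- end sketch ---------------------------------------------------------------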
7314 
7315 /* ------------------------------------------------------------------------ */
7316 
7317 /* NOTE: this is called with the __kmp_init_lock held */
7318 void __kmp_cleanup(void) {
7319  int f;
7320 
7321  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7322 
7323  if (TCR_4(__kmp_init_parallel)) {
7324 #if KMP_HANDLE_SIGNALS
7325  __kmp_remove_signals();
7326 #endif
7327  TCW_4(__kmp_init_parallel, FALSE);
7328  }
7329 
7330  if (TCR_4(__kmp_init_middle)) {
7331 #if KMP_AFFINITY_SUPPORTED
7332  __kmp_affinity_uninitialize();
7333 #endif /* KMP_AFFINITY_SUPPORTED */
7334  __kmp_cleanup_hierarchy();
7335  TCW_4(__kmp_init_middle, FALSE);
7336  }
7337 
7338  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7339 
7340  if (__kmp_init_serial) {
7341  __kmp_runtime_destroy();
7342  __kmp_init_serial = FALSE;
7343  }
7344 
7345  __kmp_cleanup_threadprivate_caches();
7346 
7347  for (f = 0; f < __kmp_threads_capacity; f++) {
7348  if (__kmp_root[f] != NULL) {
7349  __kmp_free(__kmp_root[f]);
7350  __kmp_root[f] = NULL;
7351  }
7352  }
7353  __kmp_free(__kmp_threads);
7354  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7355  // there is no need to free __kmp_root separately.
7356  __kmp_threads = NULL;
7357  __kmp_root = NULL;
7358  __kmp_threads_capacity = 0;
7359 
7360 #if KMP_USE_DYNAMIC_LOCK
7361  __kmp_cleanup_indirect_user_locks();
7362 #else
7363  __kmp_cleanup_user_locks();
7364 #endif
7365 
7366 #if KMP_AFFINITY_SUPPORTED
7367  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7368  __kmp_cpuinfo_file = NULL;
7369 #endif /* KMP_AFFINITY_SUPPORTED */
7370 
7371 #if KMP_USE_ADAPTIVE_LOCKS
7372 #if KMP_DEBUG_ADAPTIVE_LOCKS
7373  __kmp_print_speculative_stats();
7374 #endif
7375 #endif
7376  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7377  __kmp_nested_nth.nth = NULL;
7378  __kmp_nested_nth.size = 0;
7379  __kmp_nested_nth.used = 0;
7380  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7381  __kmp_nested_proc_bind.bind_types = NULL;
7382  __kmp_nested_proc_bind.size = 0;
7383  __kmp_nested_proc_bind.used = 0;
7384 
7385  __kmp_i18n_catclose();
7386 
7387 #if KMP_USE_HIER_SCHED
7388  __kmp_hier_scheds.deallocate();
7389 #endif
7390 
7391 #if KMP_STATS_ENABLED
7392  __kmp_stats_fini();
7393 #endif
7394 
7395  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7396 }
7397 
7398 /* ------------------------------------------------------------------------ */
7399 
7400 int __kmp_ignore_mppbeg(void) {
7401  char *env;
7402 
7403  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7404  if (__kmp_str_match_false(env))
7405  return FALSE;
7406  }
7407  // By default __kmpc_begin() is no-op.
7408  return TRUE;
7409 }
7410 
7411 int __kmp_ignore_mppend(void) {
7412  char *env;
7413 
7414  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7415  if (__kmp_str_match_false(env))
7416  return FALSE;
7417  }
7418  // By default __kmpc_end() is no-op.
7419  return TRUE;
7420 }
7421 
7422 void __kmp_internal_begin(void) {
7423  int gtid;
7424  kmp_root_t *root;
7425 
7426  /* this is a very important step as it will register new sibling threads
7427  and assign these new uber threads a new gtid */
7428  gtid = __kmp_entry_gtid();
7429  root = __kmp_threads[gtid]->th.th_root;
7430  KMP_ASSERT(KMP_UBER_GTID(gtid));
7431 
7432  if (root->r.r_begin)
7433  return;
7434  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7435  if (root->r.r_begin) {
7436  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7437  return;
7438  }
7439 
7440  root->r.r_begin = TRUE;
7441 
7442  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7443 }
7444 
7445 /* ------------------------------------------------------------------------ */
7446 
7447 void __kmp_user_set_library(enum library_type arg) {
7448  int gtid;
7449  kmp_root_t *root;
7450  kmp_info_t *thread;
7451 
7452  /* first, make sure we are initialized so we can get our gtid */
7453 
7454  gtid = __kmp_entry_gtid();
7455  thread = __kmp_threads[gtid];
7456 
7457  root = thread->th.th_root;
7458 
7459  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7460  library_serial));
7461  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7462  thread */
7463  KMP_WARNING(SetLibraryIncorrectCall);
7464  return;
7465  }
7466 
7467  switch (arg) {
7468  case library_serial:
7469  thread->th.th_set_nproc = 0;
7470  set__nproc(thread, 1);
7471  break;
7472  case library_turnaround:
7473  thread->th.th_set_nproc = 0;
7474  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7475  : __kmp_dflt_team_nth_ub);
7476  break;
7477  case library_throughput:
7478  thread->th.th_set_nproc = 0;
7479  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7480  : __kmp_dflt_team_nth_ub);
7481  break;
7482  default:
7483  KMP_FATAL(UnknownLibraryType, arg);
7484  }
7485 
7486  __kmp_aux_set_library(arg);
7487 }
7488 
7489 void __kmp_aux_set_stacksize(size_t arg) {
7490  if (!__kmp_init_serial)
7491  __kmp_serial_initialize();
7492 
7493 #if KMP_OS_DARWIN
7494  if (arg & (0x1000 - 1)) {
7495  arg &= ~(0x1000 - 1);
7496  if (arg + 0x1000) /* check for overflow if we round up */
7497  arg += 0x1000;
7498  }
7499 #endif
7500  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7501 
7502  /* only change the default stacksize before the first parallel region */
7503  if (!TCR_4(__kmp_init_parallel)) {
7504  size_t value = arg; /* argument is in bytes */
7505 
7506  if (value < __kmp_sys_min_stksize)
7507  value = __kmp_sys_min_stksize;
7508  else if (value > KMP_MAX_STKSIZE)
7509  value = KMP_MAX_STKSIZE;
7510 
7511  __kmp_stksize = value;
7512 
7513  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7514  }
7515 
7516  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7517 }
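
// --- Editor's illustrative sketch; not part of kmp_runtime.cpp ---------------
// The KMP_OS_DARWIN branch above rounds a requested stack size up to the next
// 4 KiB (0x1000) boundary before the min/max clamp: a request of 0x12345 bytes
// becomes 0x13000, while an already aligned 0x12000 is left unchanged. A
// standalone restatement, with the same overflow guard:
#include <stddef.h>

static size_t demo_round_stack_to_page(size_t bytes) {
  const size_t page = 0x1000; // 4 KiB, as hard-coded above
  if (bytes & (page - 1)) {   // not already page-aligned
    bytes &= ~(page - 1);     // drop the low bits ...
    if (bytes + page)         // ... and, unless that would overflow,
      bytes += page;          // ... bump up to the next boundary
  }
  return bytes;
}
// --- end sketch ---------------------------------------------------------------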
7518 
7519 /* set the behaviour of the runtime library */
7520 /* TODO this can cause some odd behaviour with sibling parallelism... */
7521 void __kmp_aux_set_library(enum library_type arg) {
7522  __kmp_library = arg;
7523 
7524  switch (__kmp_library) {
7525  case library_serial: {
7526  KMP_INFORM(LibraryIsSerial);
7527  (void)__kmp_change_library(TRUE);
7528  } break;
7529  case library_turnaround:
7530  (void)__kmp_change_library(TRUE);
7531  break;
7532  case library_throughput:
7533  (void)__kmp_change_library(FALSE);
7534  break;
7535  default:
7536  KMP_FATAL(UnknownLibraryType, arg);
7537  }
7538 }
7539 
7540 /* ------------------------------------------------------------------------ */
7541 
7542 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7543  int blocktime = arg; /* argument is in milliseconds */
7544 #if KMP_USE_MONITOR
7545  int bt_intervals;
7546 #endif
7547  int bt_set;
7548 
7549  __kmp_save_internal_controls(thread);
7550 
7551  /* Normalize and set blocktime for the teams */
7552  if (blocktime < KMP_MIN_BLOCKTIME)
7553  blocktime = KMP_MIN_BLOCKTIME;
7554  else if (blocktime > KMP_MAX_BLOCKTIME)
7555  blocktime = KMP_MAX_BLOCKTIME;
7556 
7557  set__blocktime_team(thread->th.th_team, tid, blocktime);
7558  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7559 
7560 #if KMP_USE_MONITOR
7561  /* Calculate and set blocktime intervals for the teams */
7562  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7563 
7564  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7565  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7566 #endif
7567 
7568  /* Set whether blocktime has been set to "TRUE" */
7569  bt_set = TRUE;
7570 
7571  set__bt_set_team(thread->th.th_team, tid, bt_set);
7572  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7573 #if KMP_USE_MONITOR
7574  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7575  "bt_intervals=%d, monitor_updates=%d\n",
7576  __kmp_gtid_from_tid(tid, thread->th.th_team),
7577  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7578  __kmp_monitor_wakeups));
7579 #else
7580  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7581  __kmp_gtid_from_tid(tid, thread->th.th_team),
7582  thread->th.th_team->t.t_id, tid, blocktime));
7583 #endif
7584 }
7585 
7586 void __kmp_aux_set_defaults(char const *str, int len) {
7587  if (!__kmp_init_serial) {
7588  __kmp_serial_initialize();
7589  }
7590  __kmp_env_initialize(str);
7591 
7592  if (__kmp_settings
7593 #if OMP_40_ENABLED
7594  || __kmp_display_env || __kmp_display_env_verbose
7595 #endif // OMP_40_ENABLED
7596  ) {
7597  __kmp_env_print();
7598  }
7599 } // __kmp_aux_set_defaults
7600 
7601 /* ------------------------------------------------------------------------ */
7602 /* internal fast reduction routines */
7603 
7604 PACKED_REDUCTION_METHOD_T
7605 __kmp_determine_reduction_method(
7606  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7607  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7608  kmp_critical_name *lck) {
7609 
7610  // Default reduction method: critical construct ( lck != NULL, like in current
7611  // PAROPT )
7612  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7613  // can be selected by RTL
7614  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7615  // can be selected by RTL
7616  // Finally, it's up to the OpenMP RTL to decide which method to select
7617  // among those generated by PAROPT.
7618 
7619  PACKED_REDUCTION_METHOD_T retval;
7620 
7621  int team_size;
7622 
7623  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7624  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7625 
7626 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
7627  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7628 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7629 
7630  retval = critical_reduce_block;
7631 
7632  // another way of getting the team size (with 1 dynamic dereference) is slower
7633  team_size = __kmp_get_team_num_threads(global_tid);
7634  if (team_size == 1) {
7635 
7636  retval = empty_reduce_block;
7637 
7638  } else {
7639 
7640  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7641  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7642 
7643 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7644 
7645 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || \
7646  KMP_OS_DARWIN || KMP_OS_HURD
7647 
7648  int teamsize_cutoff = 4;
7649 
7650 #if KMP_MIC_SUPPORTED
7651  if (__kmp_mic_type != non_mic) {
7652  teamsize_cutoff = 8;
7653  }
7654 #endif
7655  if (tree_available) {
7656  if (team_size <= teamsize_cutoff) {
7657  if (atomic_available) {
7658  retval = atomic_reduce_block;
7659  }
7660  } else {
7661  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7662  }
7663  } else if (atomic_available) {
7664  retval = atomic_reduce_block;
7665  }
7666 #else
7667 #error "Unknown or unsupported OS"
7668 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7669 // KMP_OS_DARWIN
7670 
7671 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7672 
7673 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD
7674 
7675  // basic tuning
7676 
7677  if (atomic_available) {
7678  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7679  retval = atomic_reduce_block;
7680  }
7681  } // otherwise: use critical section
7682 
7683 #elif KMP_OS_DARWIN
7684 
7685  if (atomic_available && (num_vars <= 3)) {
7686  retval = atomic_reduce_block;
7687  } else if (tree_available) {
7688  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7689  (reduce_size < (2000 * sizeof(kmp_real64)))) {
7690  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7691  }
7692  } // otherwise: use critical section
7693 
7694 #else
7695 #error "Unknown or unsupported OS"
7696 #endif
7697 
7698 #else
7699 #error "Unknown or unsupported architecture"
7700 #endif
7701  }
7702 
7703  // KMP_FORCE_REDUCTION
7704 
7705  // If the team is serialized (team_size == 1), ignore the forced reduction
7706  // method and stay with the unsynchronized method (empty_reduce_block)
7707  if (__kmp_force_reduction_method != reduction_method_not_defined &&
7708  team_size != 1) {
7709 
7710  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7711 
7712  int atomic_available, tree_available;
7713 
7714  switch ((forced_retval = __kmp_force_reduction_method)) {
7715  case critical_reduce_block:
7716  KMP_ASSERT(lck); // lck should be != 0
7717  break;
7718 
7719  case atomic_reduce_block:
7720  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7721  if (!atomic_available) {
7722  KMP_WARNING(RedMethodNotSupported, "atomic");
7723  forced_retval = critical_reduce_block;
7724  }
7725  break;
7726 
7727  case tree_reduce_block:
7728  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7729  if (!tree_available) {
7730  KMP_WARNING(RedMethodNotSupported, "tree");
7731  forced_retval = critical_reduce_block;
7732  } else {
7733 #if KMP_FAST_REDUCTION_BARRIER
7734  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7735 #endif
7736  }
7737  break;
7738 
7739  default:
7740  KMP_ASSERT(0); // "unsupported method specified"
7741  }
7742 
7743  retval = forced_retval;
7744  }
7745 
7746  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7747 
7748 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7749 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7750 
7751  return (retval);
7752 }
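
// --- Editor's illustrative sketch; not part of kmp_runtime.cpp ---------------
// A simplified restatement of the x86_64 / Linux branch of the selection logic
// above, assuming a team-size cutoff of 4 (8 on MIC) and ignoring the
// KMP_FORCE_REDUCTION override: serialized teams need no synchronization, large
// teams prefer the tree method with the reduction barrier, small teams prefer
// atomics when the compiler generated them, and the critical section remains
// the fallback. The demo_ enum only loosely mirrors the packed reduction-method
// constants used by the runtime.
typedef enum {
  demo_critical_reduce,
  demo_empty_reduce,
  demo_atomic_reduce,
  demo_tree_reduce
} demo_reduce_method_t;

static demo_reduce_method_t demo_pick_reduction_method(int team_size,
                                                       int atomic_available,
                                                       int tree_available) {
  const int teamsize_cutoff = 4;
  if (team_size == 1)
    return demo_empty_reduce; // serialized: unsynchronized reduction
  if (tree_available && team_size > teamsize_cutoff)
    return demo_tree_reduce; // large team: tree + reduction barrier
  if (atomic_available)
    return demo_atomic_reduce; // small team: per-variable atomics
  return demo_critical_reduce; // fallback: critical section
}
// --- end sketch ---------------------------------------------------------------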
7753 
7754 // this function is for testing set/get/determine reduce method
7755 kmp_int32 __kmp_get_reduce_method(void) {
7756  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7757 }