1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 
29 #if OMPT_SUPPORT
30 #include "ompt-specific.h"
31 #endif
32 
33 /* these are temporary issues to be dealt with */
34 #define KMP_USE_PRCTL 0
35 #define KMP_USE_POOLED_ALLOC 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 
42 #if defined(KMP_GOMP_COMPAT)
43 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_40_ENABLED
48  "4.0 (201307)";
49 #else
50  "3.1 (201107)";
51 #endif
52 
53 #ifdef KMP_DEBUG
54 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56 
57 
58 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
59 
60 /* ------------------------------------------------------------------------ */
61 /* ------------------------------------------------------------------------ */
62 
63 kmp_info_t __kmp_monitor;
64 
65 /* ------------------------------------------------------------------------ */
66 /* ------------------------------------------------------------------------ */
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup( void );
71 
72 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
73 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
74 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places( kmp_team_t *team );
76 #endif
77 static void __kmp_do_serial_initialize( void );
78 void __kmp_fork_barrier( int gtid, int tid );
79 void __kmp_join_barrier( int gtid );
80 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
81 
82 #ifdef USE_LOAD_BALANCE
83 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
84 #endif
85 
86 static int __kmp_expand_threads(int nWish, int nNeed);
87 #if KMP_OS_WINDOWS
88 static int __kmp_unregister_root_other_thread( int gtid );
89 #endif
90 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
91 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
92 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
93 
94 /* ------------------------------------------------------------------------ */
95 /* ------------------------------------------------------------------------ */
96 
97 /* Calculate the identifier of the current thread */
98 /* fast (and somewhat portable) way to get unique */
99 /* identifier of executing thread. */
100 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
101 
102 int
103 __kmp_get_global_thread_id( )
104 {
105  int i;
106  kmp_info_t **other_threads;
107  size_t stack_data;
108  char *stack_addr;
109  size_t stack_size;
110  char *stack_base;
111 
112  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
113  __kmp_nth, __kmp_all_nth ));
114 
115  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
116  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
117  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
118  __kmp_init_gtid for this to work. */
119 
120  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
121 
122 #ifdef KMP_TDATA_GTID
123  if ( TCR_4(__kmp_gtid_mode) >= 3) {
124  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
125  return __kmp_gtid;
126  }
127 #endif
128  if ( TCR_4(__kmp_gtid_mode) >= 2) {
129  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
130  return __kmp_gtid_get_specific();
131  }
132  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
133 
134  stack_addr = (char*) & stack_data;
135  other_threads = __kmp_threads;
136 
137  /*
138  ATT: The code below is a source of potential bugs due to unsynchronized access to
139  __kmp_threads array. For example:
140  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
141  2. Current thread is suspended by OS.
142  3. Another thread unregisters and finishes (debug versions of free() may fill memory
143  with something like 0xEF).
144  4. Current thread is resumed.
145  5. Current thread reads junk from *thr.
146  TODO: Fix it.
147  --ln
148  */
149 
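 /* Walk the registered threads looking for one whose recorded stack region
    [stack_base - stack_size, stack_base] contains the address of our local
    stack_data variable; that thread should be the caller (the debug assert
    below cross-checks the result against keyed TLS). */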
150  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
151 
152  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
153  if( !thr ) continue;
154 
155  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158  /* stack grows down -- search through all of the active threads */
159 
160  if( stack_addr <= stack_base ) {
161  size_t stack_diff = stack_base - stack_addr;
162 
163  if( stack_diff <= stack_size ) {
164  /* The only way we can be closer than the allocated */
165  /* stack size is if we are running on this thread. */
166  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
167  return i;
168  }
169  }
170  }
171 
172  /* get specific to try and determine our gtid */
173  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
174  "thread, using TLS\n" ));
175  i = __kmp_gtid_get_specific();
176 
177  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
178 
179  /* if we haven't been assigned a gtid, then return the error code */
180  if( i<0 ) return i;
181 
182  /* dynamically updated stack window for uber threads to avoid get_specific call */
183  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
184  KMP_FATAL( StackOverflow, i );
185  }
186 
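 /* Extend the recorded stack window for this uber thread so it covers the
    current stack address: if the address is above the saved base, move the
    base up and grow the size; otherwise record the distance from the base
    to the current address as the new size. */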
187  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
188  if( stack_addr > stack_base ) {
189  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
192  } else {
193  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
194  }
195 
196  /* Reprint stack bounds for ubermaster since they have been refined */
197  if ( __kmp_storage_map ) {
198  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
199  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
200  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
201  other_threads[i]->th.th_info.ds.ds_stacksize,
202  "th_%d stack (refinement)", i );
203  }
204  return i;
205 }
206 
207 int
208 __kmp_get_global_thread_id_reg( )
209 {
210  int gtid;
211 
212  if ( !__kmp_init_serial ) {
213  gtid = KMP_GTID_DNE;
214  } else
215 #ifdef KMP_TDATA_GTID
216  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
217  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
218  gtid = __kmp_gtid;
219  } else
220 #endif
221  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
222  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
223  gtid = __kmp_gtid_get_specific();
224  } else {
225  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
226  gtid = __kmp_get_global_thread_id();
227  }
228 
229  /* we must be a new uber master sibling thread */
230  if( gtid == KMP_GTID_DNE ) {
231  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
232  "Registering a new gtid.\n" ));
233  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
234  if( !__kmp_init_serial ) {
235  __kmp_do_serial_initialize();
236  gtid = __kmp_gtid_get_specific();
237  } else {
238  gtid = __kmp_register_root(FALSE);
239  }
240  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
241  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
242  }
243 
244  KMP_DEBUG_ASSERT( gtid >=0 );
245 
246  return gtid;
247 }
248 
249 /* caller must hold forkjoin_lock */
250 void
251 __kmp_check_stack_overlap( kmp_info_t *th )
252 {
253  int f;
254  char *stack_beg = NULL;
255  char *stack_end = NULL;
256  int gtid;
257 
258  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
259  if ( __kmp_storage_map ) {
260  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
261  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262 
263  gtid = __kmp_gtid_from_thread( th );
264 
265  if (gtid == KMP_GTID_MONITOR) {
266  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
267  "th_%s stack (%s)", "mon",
268  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
269  } else {
270  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271  "th_%d stack (%s)", gtid,
272  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
273  }
274  }
275 
276  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
277  gtid = __kmp_gtid_from_thread( th );
278  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
279  {
280  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
281  if ( stack_beg == NULL ) {
282  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
283  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
284  }
285 
286  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
287  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
288 
289  if( f_th && f_th != th ) {
290  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
291  char *other_stack_beg = other_stack_end -
292  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
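 /* Flag an overlap if either end of this thread's stack lies strictly
    inside the other thread's stack region. */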
293  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
294  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
295 
296  /* Print the other stack values before the abort */
297  if ( __kmp_storage_map )
298  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
299  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
300  "th_%d stack (overlapped)",
301  __kmp_gtid_from_thread( f_th ) );
302 
303  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
304  }
305  }
306  }
307  }
308  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
309 }
310 
311 
312 /* ------------------------------------------------------------------------ */
313 
314 /* ------------------------------------------------------------------------ */
315 
316 void
317 __kmp_infinite_loop( void )
318 {
319  static int done = FALSE;
320 
321  while (! done) {
322  KMP_YIELD( 1 );
323  }
324 }
325 
326 #define MAX_MESSAGE 512
327 
328 void
329 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
330  char buffer[MAX_MESSAGE];
331  va_list ap;
332 
333  va_start( ap, format);
334  KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
335  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
336  __kmp_vprintf( kmp_err, buffer, ap );
337 #if KMP_PRINT_DATA_PLACEMENT
338  int node;
339  if(gtid >= 0) {
340  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
341  if( __kmp_storage_map_verbose ) {
342  node = __kmp_get_host_node(p1);
343  if(node < 0) /* doesn't work, so don't try this next time */
344  __kmp_storage_map_verbose = FALSE;
345  else {
346  char *last;
347  int lastNode;
348  int localProc = __kmp_get_cpu_from_gtid(gtid);
349 
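 /* Round p1 down to the start of its page and p2 down to the page that
    contains the last byte of the range, so the scan below works on whole
    pages. */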
350  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
351  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
352  if(localProc >= 0)
353  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
354  else
355  __kmp_printf_no_lock(" GTID %d\n", gtid);
356 # if KMP_USE_PRCTL
357 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
358  do {
359  last = p1;
360  lastNode = node;
361  /* This loop collates adjacent pages with the same host node. */
362  do {
363  p1 = (char *)p1 + PAGE_SIZE;
364  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
365  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
366  (char*)p1 - 1, lastNode);
367  } while(p1 <= p2);
368 # else
369  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
370  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
371  if(p1 < p2) {
372  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
373  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
374  }
375 # endif
376  }
377  }
378  } else
379  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
380  }
381 #endif /* KMP_PRINT_DATA_PLACEMENT */
382  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
383 }
384 
385 void
386 __kmp_warn( char const * format, ... )
387 {
388  char buffer[MAX_MESSAGE];
389  va_list ap;
390 
391  if ( __kmp_generate_warnings == kmp_warnings_off ) {
392  return;
393  }
394 
395  va_start( ap, format );
396 
397  KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
398  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
399  __kmp_vprintf( kmp_err, buffer, ap );
400  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
401 
402  va_end( ap );
403 }
404 
405 void
406 __kmp_abort_process()
407 {
408 
409  // Later threads may stall here, but that's ok because abort() will kill them.
410  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
411 
412  if ( __kmp_debug_buf ) {
413  __kmp_dump_debug_buffer();
414  }; // if
415 
416  if ( KMP_OS_WINDOWS ) {
417  // Let other threads know of abnormal termination and prevent deadlock
418  // if abort happened during library initialization or shutdown
419  __kmp_global.g.g_abort = SIGABRT;
420 
421  /*
422  On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly testing.
423  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
424  works well, but this function is not available in VS7 (this is not a problem for the DLL, but
425  it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
426  not help, at least in some versions of the MS C RTL.
427 
428  It seems the following sequence is the only way to simulate abort() and avoid the pop-up
429  error box.
430  */
431  raise( SIGABRT );
432  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
433  } else {
434  abort();
435  }; // if
436 
437  __kmp_infinite_loop();
438  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
439 
440 } // __kmp_abort_process
441 
442 void
443 __kmp_abort_thread( void )
444 {
445  // TODO: Eliminate g_abort global variable and this function.
446  // In case of abort, just call abort(); it will kill all the threads.
447  __kmp_infinite_loop();
448 } // __kmp_abort_thread
449 
450 /* ------------------------------------------------------------------------ */
451 
452 /*
453  * Print out the storage map for the major kmp_info_t thread data structures
454  * that are allocated together.
455  */
456 
457 static void
458 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
459 {
460  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
461 
462  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
463  "th_%d.th_info", gtid );
464 
465  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
466  "th_%d.th_local", gtid );
467 
468  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
469  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
470 
471  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
472  &thr->th.th_bar[bs_plain_barrier+1],
473  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
474 
475  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
476  &thr->th.th_bar[bs_forkjoin_barrier+1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
478 
479  #if KMP_FAST_REDUCTION_BARRIER
480  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
481  &thr->th.th_bar[bs_reduction_barrier+1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
483  #endif // KMP_FAST_REDUCTION_BARRIER
484 }
485 
486 /*
487  * Print out the storage map for the major kmp_team_t team data structures
488  * that are allocated together.
489  */
490 
491 static void
492 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
493 {
494  int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
495  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
496  header, team_id );
497 
498  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
499  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
500 
501 
502  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
503  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
504 
505  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
506  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
507 
508  #if KMP_FAST_REDUCTION_BARRIER
509  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
510  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
511  #endif // KMP_FAST_REDUCTION_BARRIER
512 
513  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
514  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
515 
516  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
517  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
518 
519  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
520  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
521  header, team_id );
522 
523  /*
524  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
525  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
526 
527  __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
528  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
529 
530  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
531  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
532 
533  __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
534  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
535 
536  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
537  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
538 
539  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
540  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
541 
542  //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
543  // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
544 
545  __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
546  sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
547 #if OMP_40_ENABLED
548  __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
549  sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
550 #endif
551  */
552 
553  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
554  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
555 }
556 
557 static void __kmp_init_allocator() {}
558 static void __kmp_fini_allocator() {}
559 
560 /* ------------------------------------------------------------------------ */
561 
562 #ifdef KMP_DYNAMIC_LIB
563 # if KMP_OS_WINDOWS
564 
565 
566 static void
567 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
568  // TODO: Change to __kmp_break_bootstrap_lock().
569  __kmp_init_bootstrap_lock( lck ); // make the lock released
570 }
571 
572 static void
573 __kmp_reset_locks_on_process_detach( int gtid_req ) {
574  int i;
575  int thread_count;
576 
577  // PROCESS_DETACH is expected to be called by a thread
578  // that executes ProcessExit() or FreeLibrary().
579  // OS terminates other threads (except the one calling ProcessExit or FreeLibrary).
580  // So, it might be safe to access the __kmp_threads[] without taking the forkjoin_lock.
581  // However, some threads can still be alive here, although they are about to be terminated.
582  // The threads in the array with ds_thread==0 are the most suspicious.
583  // So it may actually not be safe to access __kmp_threads[].
584 
585  // TODO: does it make sense to check __kmp_roots[] ?
586 
587  // Let's check that there are no other alive threads registered with the OMP lib.
588  while( 1 ) {
589  thread_count = 0;
590  for( i = 0; i < __kmp_threads_capacity; ++i ) {
591  if( !__kmp_threads ) continue;
592  kmp_info_t* th = __kmp_threads[ i ];
593  if( th == NULL ) continue;
594  int gtid = th->th.th_info.ds.ds_gtid;
595  if( gtid == gtid_req ) continue;
596  if( gtid < 0 ) continue;
597  DWORD exit_val;
598  int alive = __kmp_is_thread_alive( th, &exit_val );
599  if( alive ) {
600  ++thread_count;
601  }
602  }
603  if( thread_count == 0 ) break; // success
604  }
605 
606  // Assume that I'm alone.
607 
608  // Now it is probably safe to check and reset the locks.
609  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
610  __kmp_reset_lock( &__kmp_forkjoin_lock );
611  #ifdef KMP_DEBUG
612  __kmp_reset_lock( &__kmp_stdio_lock );
613  #endif // KMP_DEBUG
614 
615 
616 }
617 
618 BOOL WINAPI
619 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
620  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
621 
622  switch( fdwReason ) {
623 
624  case DLL_PROCESS_ATTACH:
625  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
626 
627  return TRUE;
628 
629  case DLL_PROCESS_DETACH:
630  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
631  __kmp_gtid_get_specific() ));
632 
633  if( lpReserved != NULL )
634  {
635  // lpReserved is used for telling the difference:
636  // lpReserved == NULL when FreeLibrary() was called,
637  // lpReserved != NULL when the process terminates.
638  // When FreeLibrary() is called, worker threads remain alive.
639  // So they will release the forkjoin lock by themselves.
640  // When the process terminates, worker threads disappear triggering
641  // the problem of unreleased forkjoin lock as described below.
642 
643  // A worker thread can take the forkjoin lock
644  // in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep().
645  // The problem comes up if that worker thread becomes dead
646  // before it releases the forkjoin lock.
647  // The forkjoin lock remains taken, while the thread
648  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
649  // will try to take the forkjoin lock and will always fail,
650  // so that the application will never finish [normally].
651  // This scenario is possible if __kmpc_end() has not been executed.
652  // These look like common cases rather than corner cases:
653  // - the main function was compiled by an alternative compiler;
654  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
655  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
656  // - alive foreign thread prevented __kmpc_end from doing cleanup.
657 
658  // This is a hack to work around the problem.
659  // TODO: !!! to figure out something better.
660  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
661  }
662 
663  __kmp_internal_end_library( __kmp_gtid_get_specific() );
664 
665  return TRUE;
666 
667  case DLL_THREAD_ATTACH:
668  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
669 
670  /* if we wanted to register new siblings all the time here call
671  * __kmp_get_gtid(); */
672  return TRUE;
673 
674  case DLL_THREAD_DETACH:
675  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
676  __kmp_gtid_get_specific() ));
677 
678  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
679  return TRUE;
680  }
681 
682  return TRUE;
683 }
684 
685 # endif /* KMP_OS_WINDOWS */
686 #endif /* KMP_DYNAMIC_LIB */
687 
688 
689 /* ------------------------------------------------------------------------ */
690 
691 /* Change the library type to "status" and return the old type */
692 /* called from within initialization routines where __kmp_initz_lock is held */
693 int
694 __kmp_change_library( int status )
695 {
696  int old_status;
697 
698  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
699 
700  if (status) {
701  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
702  }
703  else {
704  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
705  }
706 
707  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
708 }
709 
710 /* ------------------------------------------------------------------------ */
711 /* ------------------------------------------------------------------------ */
712 
713 /* __kmp_parallel_deo --
714  * Wait until it's our turn.
715  */
716 void
717 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
718 {
719  int gtid = *gtid_ref;
720 #ifdef BUILD_PARALLEL_ORDERED
721  kmp_team_t *team = __kmp_team_from_gtid( gtid );
722 #endif /* BUILD_PARALLEL_ORDERED */
723 
724  if( __kmp_env_consistency_check ) {
725  if( __kmp_threads[gtid]->th.th_root->r.r_active )
726 #if KMP_USE_DYNAMIC_LOCK
727  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
728 #else
729  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
730 #endif
731  }
732 #ifdef BUILD_PARALLEL_ORDERED
733  if( !team->t.t_serialized ) {
734  KMP_MB();
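 /* Spin until the team's ordered ticket (t_ordered.dt.t_value) equals this
    thread's tid, i.e. until it is our turn. */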
735  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
736  KMP_MB();
737  }
738 #endif /* BUILD_PARALLEL_ORDERED */
739 }
740 
741 /* __kmp_parallel_dxo --
742  * Signal the next task.
743  */
744 
745 void
746 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
747 {
748  int gtid = *gtid_ref;
749 #ifdef BUILD_PARALLEL_ORDERED
750  int tid = __kmp_tid_from_gtid( gtid );
751  kmp_team_t *team = __kmp_team_from_gtid( gtid );
752 #endif /* BUILD_PARALLEL_ORDERED */
753 
754  if( __kmp_env_consistency_check ) {
755  if( __kmp_threads[gtid]->th.th_root->r.r_active )
756  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
757  }
758 #ifdef BUILD_PARALLEL_ORDERED
759  if ( ! team->t.t_serialized ) {
760  KMP_MB(); /* Flush all pending memory write invalidates. */
761 
762  /* use the tid of the next thread in this team */
763  /* TODO: replace with a general release procedure */
764  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
765 
766 #if OMPT_SUPPORT && OMPT_BLAME
767  if (ompt_enabled &&
768  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
769  /* accept blame for "ordered" waiting */
770  kmp_info_t *this_thread = __kmp_threads[gtid];
771  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
772  this_thread->th.ompt_thread_info.wait_id);
773  }
774 #endif
775 
776  KMP_MB(); /* Flush all pending memory write invalidates. */
777  }
778 #endif /* BUILD_PARALLEL_ORDERED */
779 }
780 
781 /* ------------------------------------------------------------------------ */
782 /* ------------------------------------------------------------------------ */
783 
784 /* ------------------------------------------------------------------------ */
785 /* ------------------------------------------------------------------------ */
786 
787 /* The BARRIER for a SINGLE process section is always explicit */
788 
789 int
790 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
791 {
792  int status;
793  kmp_info_t *th;
794  kmp_team_t *team;
795 
796  if( ! TCR_4(__kmp_init_parallel) )
797  __kmp_parallel_initialize();
798 
799  th = __kmp_threads[ gtid ];
800  team = th->th.th_team;
801  status = 0;
802 
803  th->th.th_ident = id_ref;
804 
805  if ( team->t.t_serialized ) {
806  status = 1;
807  } else {
808  kmp_int32 old_this = th->th.th_local.this_construct;
809 
810  ++th->th.th_local.this_construct;
811  /* try to set team count to thread count--success means thread got the
812  single block
813  */
814  /* TODO: Should this be acquire or release? */
815  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
816  th->th.th_local.this_construct);
817 #if USE_ITT_BUILD
818  if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
819 #if OMP_40_ENABLED
820  th->th.th_teams_microtask == NULL &&
821 #endif
822  team->t.t_active_level == 1 )
823  { // Only report metadata by master of active team at level 1
824  __kmp_itt_metadata_single( id_ref );
825  }
826 #endif /* USE_ITT_BUILD */
827  }
828 
829  if( __kmp_env_consistency_check ) {
830  if (status && push_ws) {
831  __kmp_push_workshare( gtid, ct_psingle, id_ref );
832  } else {
833  __kmp_check_workshare( gtid, ct_psingle, id_ref );
834  }
835  }
836 #if USE_ITT_BUILD
837  if ( status ) {
838  __kmp_itt_single_start( gtid );
839  }
840 #endif /* USE_ITT_BUILD */
841  return status;
842 }
843 
844 void
845 __kmp_exit_single( int gtid )
846 {
847 #if USE_ITT_BUILD
848  __kmp_itt_single_end( gtid );
849 #endif /* USE_ITT_BUILD */
850  if( __kmp_env_consistency_check )
851  __kmp_pop_workshare( gtid, ct_psingle, NULL );
852 }
853 
854 
855 /*
856  * determine if we can go parallel or must use a serialized parallel region and
857  * how many threads we can use
858  * set_nthreads is the number of threads requested for the team
859  * returns 1 if we should serialize or only use one thread,
860  * otherwise the number of threads to use
861  * The forkjoin lock is held by the caller.
862  */
863 static int
864 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
865  int master_tid, int set_nthreads
866 #if OMP_40_ENABLED
867  , int enter_teams
868 #endif /* OMP_40_ENABLED */
869 )
870 {
871  int capacity;
872  int new_nthreads;
873  KMP_DEBUG_ASSERT( __kmp_init_serial );
874  KMP_DEBUG_ASSERT( root && parent_team );
875 
876  //
877  // If dyn-var is set, dynamically adjust the number of desired threads,
878  // according to the method specified by dynamic_mode.
879  //
880  new_nthreads = set_nthreads;
881  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
882  ;
883  }
884 #ifdef USE_LOAD_BALANCE
885  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
886  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
887  if ( new_nthreads == 1 ) {
888  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
889  master_tid ));
890  return 1;
891  }
892  if ( new_nthreads < set_nthreads ) {
893  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
894  master_tid, new_nthreads ));
895  }
896  }
897 #endif /* USE_LOAD_BALANCE */
898  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
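 /* Cap the team at the number of processors not already in use, crediting
    back the threads this root already owns (just the master if the root is
    active, otherwise its entire hot team). */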
899  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
900  : root->r.r_hot_team->t.t_nproc);
901  if ( new_nthreads <= 1 ) {
902  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
903  master_tid ));
904  return 1;
905  }
906  if ( new_nthreads < set_nthreads ) {
907  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
908  master_tid, new_nthreads ));
909  }
910  else {
911  new_nthreads = set_nthreads;
912  }
913  }
914  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
915  if ( set_nthreads > 2 ) {
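 /* Draw a pseudo-random team size in the range [1, set_nthreads]. */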
916  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
917  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
918  if ( new_nthreads == 1 ) {
919  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
920  master_tid ));
921  return 1;
922  }
923  if ( new_nthreads < set_nthreads ) {
924  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
925  master_tid, new_nthreads ));
926  }
927  }
928  }
929  else {
930  KMP_ASSERT( 0 );
931  }
932 
933  //
934  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
935  //
936  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
937  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
938  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
939  root->r.r_hot_team->t.t_nproc );
940  if ( tl_nthreads <= 0 ) {
941  tl_nthreads = 1;
942  }
943 
944  //
945  // If dyn-var is false, emit a 1-time warning.
946  //
947  if ( ! get__dynamic_2( parent_team, master_tid )
948  && ( ! __kmp_reserve_warn ) ) {
949  __kmp_reserve_warn = 1;
950  __kmp_msg(
951  kmp_ms_warning,
952  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
953  KMP_HNT( Unset_ALL_THREADS ),
954  __kmp_msg_null
955  );
956  }
957  if ( tl_nthreads == 1 ) {
958  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
959  master_tid ));
960  return 1;
961  }
962  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
963  master_tid, tl_nthreads ));
964  new_nthreads = tl_nthreads;
965  }
966 
967 
968  //
969  // Check if the threads array is large enough, or needs expanding.
970  //
971  // See comment in __kmp_register_root() about the adjustment if
972  // __kmp_threads[0] == NULL.
973  //
974  capacity = __kmp_threads_capacity;
975  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
976  --capacity;
977  }
978  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
979  root->r.r_hot_team->t.t_nproc ) > capacity ) {
980  //
981  // Expand the threads array.
982  //
983  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
984  root->r.r_hot_team->t.t_nproc ) - capacity;
985  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
986  if ( slotsAdded < slotsRequired ) {
987  //
988  // The threads array was not expanded enough.
989  //
990  new_nthreads -= ( slotsRequired - slotsAdded );
991  KMP_ASSERT( new_nthreads >= 1 );
992 
993  //
994  // If dyn-var is false, emit a 1-time warning.
995  //
996  if ( ! get__dynamic_2( parent_team, master_tid )
997  && ( ! __kmp_reserve_warn ) ) {
998  __kmp_reserve_warn = 1;
999  if ( __kmp_tp_cached ) {
1000  __kmp_msg(
1001  kmp_ms_warning,
1002  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1003  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
1004  KMP_HNT( PossibleSystemLimitOnThreads ),
1005  __kmp_msg_null
1006  );
1007  }
1008  else {
1009  __kmp_msg(
1010  kmp_ms_warning,
1011  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1012  KMP_HNT( SystemLimitOnThreads ),
1013  __kmp_msg_null
1014  );
1015  }
1016  }
1017  }
1018  }
1019 
1020  if ( new_nthreads == 1 ) {
1021  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
1022  __kmp_get_gtid(), set_nthreads ) );
1023  return 1;
1024  }
1025 
1026  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
1027  __kmp_get_gtid(), new_nthreads, set_nthreads ));
1028  return new_nthreads;
1029 }
1030 
1031 /* ------------------------------------------------------------------------ */
1032 /* ------------------------------------------------------------------------ */
1033 
1034 /* allocate threads from the thread pool and assign them to the new team */
1035 /* we are assured that there are enough threads available, because we
1036  * checked on that earlier within critical section forkjoin */
1037 
1038 static void
1039 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1040  kmp_info_t *master_th, int master_gtid )
1041 {
1042  int i;
1043  int use_hot_team;
1044 
1045  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1046  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1047  KMP_MB();
1048 
1049  /* first, let's setup the master thread */
1050  master_th->th.th_info.ds.ds_tid = 0;
1051  master_th->th.th_team = team;
1052  master_th->th.th_team_nproc = team->t.t_nproc;
1053  master_th->th.th_team_master = master_th;
1054  master_th->th.th_team_serialized = FALSE;
1055  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1056 
1057  /* make sure we are not the optimized hot team */
1058 #if KMP_NESTED_HOT_TEAMS
1059  use_hot_team = 0;
1060  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1061  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1062  int level = team->t.t_active_level - 1; // index in array of hot teams
1063  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1064  if( master_th->th.th_teams_size.nteams > 1 ) {
1065  ++level; // level was not increased in teams construct for team_of_masters
1066  }
1067  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1068  master_th->th.th_teams_level == team->t.t_level ) {
1069  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1070  } // team->t.t_level will be increased inside parallel
1071  }
1072  if( level < __kmp_hot_teams_max_level ) {
1073  if( hot_teams[level].hot_team ) {
1074  // hot team has already been allocated for given level
1075  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1076  use_hot_team = 1; // the team is ready to use
1077  } else {
1078  use_hot_team = 0; // AC: threads are not allocated yet
1079  hot_teams[level].hot_team = team; // remember new hot team
1080  hot_teams[level].hot_team_nth = team->t.t_nproc;
1081  }
1082  } else {
1083  use_hot_team = 0;
1084  }
1085  }
1086 #else
1087  use_hot_team = team == root->r.r_hot_team;
1088 #endif
1089  if ( !use_hot_team ) {
1090 
1091  /* install the master thread */
1092  team->t.t_threads[ 0 ] = master_th;
1093  __kmp_initialize_info( master_th, team, 0, master_gtid );
1094 
1095  /* now, install the worker threads */
1096  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1097 
1098  /* fork or reallocate a new thread and install it in team */
1099  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1100  team->t.t_threads[ i ] = thr;
1101  KMP_DEBUG_ASSERT( thr );
1102  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1103  /* align team and thread arrived states */
1104  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1105  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1106  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1107  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1108  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1109 #if OMP_40_ENABLED
1110  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1111  thr->th.th_teams_level = master_th->th.th_teams_level;
1112  thr->th.th_teams_size = master_th->th.th_teams_size;
1113 #endif
1114  { // Initialize threads' barrier data.
1115  int b;
1116  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1117  for ( b = 0; b < bs_last_barrier; ++ b ) {
1118  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1119  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1120 #if USE_DEBUGGER
1121  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1122 #endif
1123  }; // for b
1124  }
1125  }
1126 
1127 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1128  __kmp_partition_places( team );
1129 #endif
1130 
1131  }
1132 
1133  KMP_MB();
1134 }
1135 
1136 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1137 //
1138 // Propagate any changes to the floating point control registers out to the team
1139 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1140 // so we don't make changes unless they are needed.
1141 //
1142 inline static void
1143 propagateFPControl(kmp_team_t * team)
1144 {
1145  if ( __kmp_inherit_fp_control ) {
1146  kmp_int16 x87_fpu_control_word;
1147  kmp_uint32 mxcsr;
1148 
1149  // Get master values of FPU control flags (both X87 and vector)
1150  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1151  __kmp_store_mxcsr( &mxcsr );
1152  mxcsr &= KMP_X86_MXCSR_MASK;
1153 
1154  // There is no point looking at t_fp_control_saved here.
1155  // If it is TRUE, we still have to update the values if they are different from those we now have.
1156  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1157  // that the values in the team are the same as those we have.
1158  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1159  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1160  // cache-line into a written state, causing all threads in the team to have to read it again.
1161  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1162  team->t.t_x87_fpu_control_word = x87_fpu_control_word;
1163  }
1164  if ( team->t.t_mxcsr != mxcsr ) {
1165  team->t.t_mxcsr = mxcsr;
1166  }
1167  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1168  // So we must ensure it is correct.
1169  if (!team->t.t_fp_control_saved) {
1170  team->t.t_fp_control_saved = TRUE;
1171  }
1172  }
1173  else {
1174  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1175  if (team->t.t_fp_control_saved)
1176  team->t.t_fp_control_saved = FALSE;
1177  }
1178 }
1179 
1180 // Do the opposite, setting the hardware registers to the updated values from the team.
1181 inline static void
1182 updateHWFPControl(kmp_team_t * team)
1183 {
1184  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1185  //
1186  // Only reset the fp control regs if they have been changed in the team
1187  // by the parallel region that we are exiting.
1188  //
1189  kmp_int16 x87_fpu_control_word;
1190  kmp_uint32 mxcsr;
1191  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1192  __kmp_store_mxcsr( &mxcsr );
1193  mxcsr &= KMP_X86_MXCSR_MASK;
1194 
1195  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1196  __kmp_clear_x87_fpu_status_word();
1197  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1198  }
1199 
1200  if ( team->t.t_mxcsr != mxcsr ) {
1201  __kmp_load_mxcsr( &team->t.t_mxcsr );
1202  }
1203  }
1204 }
1205 #else
1206 # define propagateFPControl(x) ((void)0)
1207 # define updateHWFPControl(x) ((void)0)
1208 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1209 
1210 static void
1211 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1212 
1213 /*
1214  * Run a parallel region that has been serialized, so it runs in a team of only the single master thread.
1215  */
1216 void
1217 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1218 {
1219  kmp_info_t *this_thr;
1220  kmp_team_t *serial_team;
1221 
1222  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1223 
1224  /* Skip all this code for autopar serialized loops since it results in
1225  unacceptable overhead */
1226  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1227  return;
1228 
1229  if( ! TCR_4( __kmp_init_parallel ) )
1230  __kmp_parallel_initialize();
1231 
1232  this_thr = __kmp_threads[ global_tid ];
1233  serial_team = this_thr->th.th_serial_team;
1234 
1235  /* utilize the serialized team held by this thread */
1236  KMP_DEBUG_ASSERT( serial_team );
1237  KMP_MB();
1238 
1239  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1240  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1241  KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1242  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1243  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1244  this_thr->th.th_task_team = NULL;
1245  }
1246 
1247 #if OMP_40_ENABLED
1248  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1249  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1250  proc_bind = proc_bind_false;
1251  }
1252  else if ( proc_bind == proc_bind_default ) {
1253  //
1254  // No proc_bind clause was specified, so use the current value
1255  // of proc-bind-var for this parallel region.
1256  //
1257  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1258  }
1259  //
1260  // Reset for next parallel region
1261  //
1262  this_thr->th.th_set_proc_bind = proc_bind_default;
1263 #endif /* OMP_40_ENABLED */
1264 
1265  if( this_thr->th.th_team != serial_team ) {
1266  // Nested level will be an index in the nested nthreads array
1267  int level = this_thr->th.th_team->t.t_level;
1268 
1269  if( serial_team->t.t_serialized ) {
1270  /* this serial team was already used
1271  * TODO: increase performance by making these locks more specific */
1272  kmp_team_t *new_team;
1273 
1274  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1275 
1276 #if OMPT_SUPPORT
1277  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1278 #endif
1279 
1280  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1281 #if OMPT_SUPPORT
1282  ompt_parallel_id,
1283 #endif
1284 #if OMP_40_ENABLED
1285  proc_bind,
1286 #endif
1287  & this_thr->th.th_current_task->td_icvs,
1288  0 USE_NESTED_HOT_ARG(NULL) );
1289  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1290  KMP_ASSERT( new_team );
1291 
1292  /* setup new serialized team and install it */
1293  new_team->t.t_threads[0] = this_thr;
1294  new_team->t.t_parent = this_thr->th.th_team;
1295  serial_team = new_team;
1296  this_thr->th.th_serial_team = serial_team;
1297 
1298  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1299  global_tid, serial_team ) );
1300 
1301 
1302  /* TODO the above breaks the requirement that if we run out of
1303  * resources, then we can still guarantee that serialized teams
1304  * are ok, since we may need to allocate a new one */
1305  } else {
1306  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1307  global_tid, serial_team ) );
1308  }
1309 
1310  /* we have to initialize this serial team */
1311  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1312  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1313  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1314  serial_team->t.t_ident = loc;
1315  serial_team->t.t_serialized = 1;
1316  serial_team->t.t_nproc = 1;
1317  serial_team->t.t_parent = this_thr->th.th_team;
1318  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1319  this_thr->th.th_team = serial_team;
1320  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1321 
1322  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1323  global_tid, this_thr->th.th_current_task ) );
1324  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1325  this_thr->th.th_current_task->td_flags.executing = 0;
1326 
1327  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1328 
1329  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1330  each serialized task represented by team->t.t_serialized? */
1331  copy_icvs(
1332  & this_thr->th.th_current_task->td_icvs,
1333  & this_thr->th.th_current_task->td_parent->td_icvs );
1334 
1335  // Thread value exists in the nested nthreads array for the next nested level
1336  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1337  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1338  }
1339 
1340 #if OMP_40_ENABLED
1341  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1342  this_thr->th.th_current_task->td_icvs.proc_bind
1343  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1344  }
1345 #endif /* OMP_40_ENABLED */
1346 
1347 #if USE_DEBUGGER
1348  serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1349 #endif
1350  this_thr->th.th_info.ds.ds_tid = 0;
1351 
1352  /* set thread cache values */
1353  this_thr->th.th_team_nproc = 1;
1354  this_thr->th.th_team_master = this_thr;
1355  this_thr->th.th_team_serialized = 1;
1356 
1357  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1358  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1359 
1360  propagateFPControl (serial_team);
1361 
1362  /* check if we need to allocate dispatch buffers stack */
1363  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1364  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1365  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1366  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1367  }
1368  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1369 
1370 #if OMPT_SUPPORT
1371  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1372  __ompt_team_assign_id(serial_team, ompt_parallel_id);
1373 #endif
1374 
1375  KMP_MB();
1376 
1377  } else {
1378  /* this serialized team is already being used,
1379  * that's fine, just add another nested level */
1380  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1381  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1382  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1383  ++ serial_team->t.t_serialized;
1384  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1385 
1386  // Nested level will be an index in the nested nthreads array
1387  int level = this_thr->th.th_team->t.t_level;
1388  // Thread value exists in the nested nthreads array for the next nested level
1389  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1390  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1391  }
1392  serial_team->t.t_level++;
1393  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1394  global_tid, serial_team, serial_team->t.t_level ) );
1395 
1396  /* allocate/push dispatch buffers stack */
1397  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1398  {
1399  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1400  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1401  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1402  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1403  }
1404  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1405 
1406  KMP_MB();
1407  }
1408 
1409  if ( __kmp_env_consistency_check )
1410  __kmp_push_parallel( global_tid, NULL );
1411 
1412 #if USE_ITT_BUILD
1413  // Mark the start of the "parallel" region for VTune. Only one frame notification scheme is used at the moment.
1414  if ( serial_team->t.t_level == 1
1415 #if OMP_40_ENABLED
1416  && this_thr->th.th_teams_microtask == NULL
1417 #endif
1418  ) {
1419 #if USE_ITT_NOTIFY
1420  // Save the start of the "parallel" region for VTune. This is the frame begin at the same time.
1421  if ( ( __itt_get_timestamp_ptr || KMP_ITT_DEBUG ) &&
1422  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
1423  {
1424  serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
1425  } else // only one notification scheme (either "submit" or "forking/joined", not both)
1426 #endif
1427  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
1428  __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode )
1429  {
1430  this_thr->th.th_ident = loc;
1431  // 0 - no barriers; 1 - serialized parallel
1432  __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 );
1433  }
1434  }
1435 #endif /* USE_ITT_BUILD */
1436 }
1437 
1438 /* most of the work for a fork */
1439 /* return true if we really went parallel, false if serialized */
1440 int
1441 __kmp_fork_call(
1442  ident_t * loc,
1443  int gtid,
1444  enum fork_context_e call_context, // Intel, GNU, ...
1445  kmp_int32 argc,
1446 #if OMPT_SUPPORT
1447  void *unwrapped_task,
1448 #endif
1449  microtask_t microtask,
1450  launch_t invoker,
1451 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1452 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1453  va_list * ap
1454 #else
1455  va_list ap
1456 #endif
1457  )
1458 {
1459  void **argv;
1460  int i;
1461  int master_tid;
1462  int master_this_cons;
1463  kmp_team_t *team;
1464  kmp_team_t *parent_team;
1465  kmp_info_t *master_th;
1466  kmp_root_t *root;
1467  int nthreads;
1468  int master_active;
1469  int master_set_numthreads;
1470  int level;
1471 #if OMP_40_ENABLED
1472  int active_level;
1473  int teams_level;
1474 #endif
1475 #if KMP_NESTED_HOT_TEAMS
1476  kmp_hot_team_ptr_t **p_hot_teams;
1477 #endif
1478  { // KMP_TIME_BLOCK
1479  KMP_TIME_DEVELOPER_BLOCK(KMP_fork_call);
1480  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1481 
1482  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1483  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1484  /* Some systems prefer the stack for the root thread(s) to start with */
1485  /* some gap from the parent stack to prevent false sharing. */
1486  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1487  /* These 2 lines below are so this does not get optimized out */
1488  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1489  __kmp_stkpadding += (short)((kmp_int64)dummy);
1490  }
1491 
1492  /* initialize if needed */
1493  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1494  if( ! TCR_4(__kmp_init_parallel) )
1495  __kmp_parallel_initialize();
1496 
1497  /* setup current data */
1498  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1499  parent_team = master_th->th.th_team;
1500  master_tid = master_th->th.th_info.ds.ds_tid;
1501  master_this_cons = master_th->th.th_local.this_construct;
1502  root = master_th->th.th_root;
1503  master_active = root->r.r_active;
1504  master_set_numthreads = master_th->th.th_set_nproc;
1505 
1506 #if OMPT_SUPPORT
1507  ompt_parallel_id_t ompt_parallel_id;
1508  ompt_task_id_t ompt_task_id;
1509  ompt_frame_t *ompt_frame;
1510  ompt_task_id_t my_task_id;
1511  ompt_parallel_id_t my_parallel_id;
1512 
1513  if (ompt_enabled) {
1514  ompt_parallel_id = __ompt_parallel_id_new(gtid);
1515  ompt_task_id = __ompt_get_task_id_internal(0);
1516  ompt_frame = __ompt_get_task_frame_internal(0);
1517  }
1518 #endif
1519 
1520  // Nested level will be an index in the nested nthreads array
1521  level = parent_team->t.t_level;
1522 #if OMP_40_ENABLED
1523  active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
1524  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1525 #endif
1526 #if KMP_NESTED_HOT_TEAMS
1527  p_hot_teams = &master_th->th.th_hot_teams;
1528  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1529  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1530  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1531  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1532  (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1533  }
1534 #endif
1535 
1536 #if OMPT_SUPPORT
1537  if (ompt_enabled &&
1538  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1539  int team_size = master_set_numthreads;
1540 
1541  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1542  ompt_task_id, ompt_frame, ompt_parallel_id,
1543  team_size, unwrapped_task, OMPT_INVOKER(call_context));
1544  }
1545 #endif
1546 
1547  master_th->th.th_ident = loc;
1548 
1549 #if OMP_40_ENABLED
1550  if ( master_th->th.th_teams_microtask &&
1551  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1552  // AC: This is the start of a parallel region nested inside a teams construct.
1553  // The team is already in place (hot); all workers are waiting at the fork barrier.
1554  // No lock is needed to do some team initialization and then release the workers.
1555  parent_team->t.t_ident = loc;
1556  parent_team->t.t_argc = argc;
1557  argv = (void**)parent_team->t.t_argv;
1558  for( i=argc-1; i >= 0; --i )
1559 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1560 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1561  *argv++ = va_arg( *ap, void * );
1562 #else
1563  *argv++ = va_arg( ap, void * );
1564 #endif
1565  /* Increment our nested depth level, but do not increase the serialization */
1566  if ( parent_team == master_th->th.th_serial_team ) {
1567  // AC: we are in serialized parallel
1568  __kmpc_serialized_parallel(loc, gtid);
1569  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1570  parent_team->t.t_serialized--; // AC: needed so that enquiry functions
1571  // work correctly; will be restored at join time
1572 
1573 #if OMPT_SUPPORT
1574  void *dummy;
1575  void **exit_runtime_p;
1576 
1577  ompt_lw_taskteam_t lw_taskteam;
1578 
1579  if (ompt_enabled) {
1580  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1581  unwrapped_task, ompt_parallel_id);
1582  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1583  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1584 
1585  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1586 
1587 #if OMPT_TRACE
1588  /* OMPT implicit task begin */
1589  my_task_id = lw_taskteam.ompt_task_info.task_id;
1590  my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1591  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1592  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1593  my_parallel_id, my_task_id);
1594  }
1595 #endif
1596 
1597  /* OMPT state */
1598  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1599  } else {
1600  exit_runtime_p = &dummy;
1601  }
1602 #endif
1603 
1604  {
1605  KMP_TIME_BLOCK(OMP_work);
1606  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1607 #if OMPT_SUPPORT
1608  , exit_runtime_p
1609 #endif
1610  );
1611  }
1612 
1613 #if OMPT_SUPPORT
1614  if (ompt_enabled) {
1615 #if OMPT_TRACE
1616  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1617 
1618  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1619  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1620  ompt_parallel_id, ompt_task_id);
1621  }
1622 
1623  __ompt_lw_taskteam_unlink(master_th);
1624  // reset/clear the task id only after unlinking the task
1625  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1626 #endif
1627 
1628  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1629  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1630  ompt_parallel_id, ompt_task_id,
1631  OMPT_INVOKER(call_context));
1632  }
1633  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1634  }
1635 #endif
1636  return TRUE;
1637  }
1638 
1639  parent_team->t.t_pkfn = microtask;
1640 #if OMPT_SUPPORT
1641  parent_team->t.ompt_team_info.microtask = unwrapped_task;
1642 #endif
1643  parent_team->t.t_invoke = invoker;
1644  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1645  parent_team->t.t_active_level ++;
1646  parent_team->t.t_level ++;
1647 
1648  /* Change number of threads in the team if requested */
1649  if ( master_set_numthreads ) { // The parallel has num_threads clause
1650  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1651  // AC: can only reduce the number of threads dynamically; cannot increase it
1652  kmp_info_t **other_threads = parent_team->t.t_threads;
1653  parent_team->t.t_nproc = master_set_numthreads;
1654  for ( i = 0; i < master_set_numthreads; ++i ) {
1655  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1656  }
1657  // Keep extra threads hot in the team for possible next parallels
1658  }
1659  master_th->th.th_set_nproc = 0;
1660  }
1661 
1662 #if USE_DEBUGGER
1663  if ( __kmp_debugging ) { // Let debugger override number of threads.
1664  int nth = __kmp_omp_num_threads( loc );
1665  if ( nth > 0 ) { // 0 means debugger does not want to change number of threads.
1666  master_set_numthreads = nth;
1667  }; // if
1668  }; // if
1669 #endif
1670 
1671  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1672  __kmp_internal_fork( loc, gtid, parent_team );
1673  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1674 
1675  /* Invoke microtask for MASTER thread */
1676  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1677  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1678 
1679  {
1680  KMP_TIME_BLOCK(OMP_work);
1681  if (! parent_team->t.t_invoke( gtid )) {
1682  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1683  }
1684  }
1685  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1686  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1687  KMP_MB(); /* Flush all pending memory write invalidates. */
1688 
1689  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1690 
1691  return TRUE;
1692  } // Parallel closely nested in teams construct
1693 #endif /* OMP_40_ENABLED */
1694 
1695 #if KMP_DEBUG
1696  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1697  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1698  }
1699 #endif
1700 
1701  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1702  nthreads = 1;
1703  } else {
1704 #if OMP_40_ENABLED
1705  int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1706 #endif
1707  nthreads = master_set_numthreads ?
1708  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1709 
1710  // Check whether we need to take the forkjoin lock (no need for a serialized parallel outside of a teams construct).
1711  // This code was moved here from __kmp_reserve_threads() to speed up nested serialized parallels.
1712  if (nthreads > 1) {
1713  if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1714 #if OMP_40_ENABLED
1715  && !enter_teams
1716 #endif /* OMP_40_ENABLED */
1717  ) ) || ( __kmp_library == library_serial ) ) {
1718  KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1719  gtid, nthreads ));
1720  nthreads = 1;
1721  }
1722  }
1723  if ( nthreads > 1 ) {
1724  /* determine how many new threads we can use */
1725  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1726 
1727  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1728 #if OMP_40_ENABLED
1729 /* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
1730  but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
1731  then the teams and their threads should be created regardless of the nesting setting. */
1732  , enter_teams
1733 #endif /* OMP_40_ENABLED */
1734  );
1735  if ( nthreads == 1 ) {
1736  // Free the lock for single-thread execution here;
1737  // for multi-thread execution it will be freed later,
1738  // after the team of threads has been created and initialized
1739  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1740  }
1741  }
1742  }
1743  KMP_DEBUG_ASSERT( nthreads > 0 );
1744 
1745  /* If we temporarily changed the set number of threads then restore it now */
1746  master_th->th.th_set_nproc = 0;
1747 
1748  /* create a serialized parallel region? */
1749  if ( nthreads == 1 ) {
1750  /* josh todo: hypothetical question: what do we do for OS X*? */
1751 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1752  void * args[ argc ];
1753 #else
1754  void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1755 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1756 
1757  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1758 
1759  __kmpc_serialized_parallel(loc, gtid);
1760 
1761  if ( call_context == fork_context_intel ) {
1762  /* TODO this sucks, use the compiler itself to pass args! :) */
1763  master_th->th.th_serial_team->t.t_ident = loc;
1764 #if OMP_40_ENABLED
1765  if ( !ap ) {
1766  // revert change made in __kmpc_serialized_parallel()
1767  master_th->th.th_serial_team->t.t_level--;
1768  // Get args from parent team for teams construct
1769 
1770 #if OMPT_SUPPORT
1771  void *dummy;
1772  void **exit_runtime_p;
1773 
1774  ompt_lw_taskteam_t lw_taskteam;
1775 
1776  if (ompt_enabled) {
1777  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1778  unwrapped_task, ompt_parallel_id);
1779  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1780  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1781 
1782  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1783 
1784 #if OMPT_TRACE
1785  my_task_id = lw_taskteam.ompt_task_info.task_id;
1786  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1787  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1788  ompt_parallel_id, my_task_id);
1789  }
1790 #endif
1791 
1792  /* OMPT state */
1793  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1794  } else {
1795  exit_runtime_p = &dummy;
1796  }
1797 #endif
1798 
1799  {
1800  KMP_TIME_BLOCK(OMP_work);
1801  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1802 #if OMPT_SUPPORT
1803  , exit_runtime_p
1804 #endif
1805  );
1806  }
1807 
1808 #if OMPT_SUPPORT
1809  if (ompt_enabled) {
1810  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1811 
1812 #if OMPT_TRACE
1813  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1814  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1815  ompt_parallel_id, ompt_task_id);
1816  }
1817 #endif
1818 
1819  __ompt_lw_taskteam_unlink(master_th);
1820  // reset/clear the task id only after unlinking the task
1821  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1822 
1823  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1824  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1825  ompt_parallel_id, ompt_task_id,
1826  OMPT_INVOKER(call_context));
1827  }
1828  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1829  }
1830 #endif
1831  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1832  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1833  team = master_th->th.th_team;
1834  //team->t.t_pkfn = microtask;
1835  team->t.t_invoke = invoker;
1836  __kmp_alloc_argv_entries( argc, team, TRUE );
1837  team->t.t_argc = argc;
1838  argv = (void**) team->t.t_argv;
1839  if ( ap ) {
1840  for( i=argc-1; i >= 0; --i )
1841 // TODO: revert workaround for Intel(R) 64 tracker #96
1842 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1843  *argv++ = va_arg( *ap, void * );
1844 # else
1845  *argv++ = va_arg( ap, void * );
1846 # endif
1847  } else {
1848  for( i=0; i < argc; ++i )
1849  // Get args from parent team for teams construct
1850  argv[i] = parent_team->t.t_argv[i];
1851  }
1852  // AC: revert change made in __kmpc_serialized_parallel()
1853  // because initial code in teams should have level=0
1854  team->t.t_level--;
1855  // AC: call special invoker for outer "parallel" of the teams construct
1856  {
1857  KMP_TIME_BLOCK(OMP_work);
1858  invoker(gtid);
1859  }
1860  } else {
1861 #endif /* OMP_40_ENABLED */
1862  argv = args;
1863  for( i=argc-1; i >= 0; --i )
1864 // TODO: revert workaround for Intel(R) 64 tracker #96
1865 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1866  *argv++ = va_arg( *ap, void * );
1867 #else
1868  *argv++ = va_arg( ap, void * );
1869 #endif
1870  KMP_MB();
1871 
1872 #if OMPT_SUPPORT
1873  void *dummy;
1874  void **exit_runtime_p;
1875 
1876  ompt_lw_taskteam_t lw_taskteam;
1877 
1878  if (ompt_enabled) {
1879  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1880  unwrapped_task, ompt_parallel_id);
1881  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1882  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1883 
1884  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1885 
1886 #if OMPT_TRACE
1887  /* OMPT implicit task begin */
1888  my_task_id = lw_taskteam.ompt_task_info.task_id;
1889  my_parallel_id = ompt_parallel_id;
1890  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1891  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1892  my_parallel_id, my_task_id);
1893  }
1894 #endif
1895 
1896  /* OMPT state */
1897  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1898  } else {
1899  exit_runtime_p = &dummy;
1900  }
1901 #endif
1902 
1903  {
1904  KMP_TIME_BLOCK(OMP_work);
1905  __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1906 #if OMPT_SUPPORT
1907  , exit_runtime_p
1908 #endif
1909  );
1910  }
1911 
1912 #if OMPT_SUPPORT
1913  if (ompt_enabled) {
1914 #if OMPT_TRACE
1915  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1916 
1917  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1918  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1919  my_parallel_id, my_task_id);
1920  }
1921 #endif
1922 
1923  __ompt_lw_taskteam_unlink(master_th);
1924  // reset/clear the task id only after unlinking the task
1925  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1926 
1927  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1928  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1929  ompt_parallel_id, ompt_task_id,
1930  OMPT_INVOKER(call_context));
1931  }
1932  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1933  }
1934 #endif
1935 #if OMP_40_ENABLED
1936  }
1937 #endif /* OMP_40_ENABLED */
1938  }
1939  else if ( call_context == fork_context_gnu ) {
1940 #if OMPT_SUPPORT
1941  ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1942  __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1943  __ompt_lw_taskteam_init(lwt, master_th, gtid,
1944  unwrapped_task, ompt_parallel_id);
1945 
1946  lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1947  lwt->ompt_task_info.frame.exit_runtime_frame = 0;
1948  __ompt_lw_taskteam_link(lwt, master_th);
1949 #endif
1950 
1951  // we were called from GNU native code
1952  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1953  return FALSE;
1954  }
1955  else {
1956  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1957  }
1958 
1959 
1960  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1961  KMP_MB();
1962  return FALSE;
1963  }
1964 
1965  // GEH: only modify the executing flag in the case when not serialized;
1966  // the serialized case is handled in __kmpc_serialized_parallel
1967  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1968  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1969  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1970  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1971  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1972  master_th->th.th_current_task->td_flags.executing = 0;
1973 
1974 #if OMP_40_ENABLED
1975  if ( !master_th->th.th_teams_microtask || level > teams_level )
1976 #endif /* OMP_40_ENABLED */
1977  {
1978  /* Increment our nested depth level */
1979  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1980  }
1981 
1982  // See if we need to make a copy of the ICVs.
1983  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1984  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1985  nthreads_icv = __kmp_nested_nth.nth[level+1];
1986  }
1987  else {
1988  nthreads_icv = 0; // don't update
1989  }
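  /*
   * Editor's note (illustrative, assuming a libomp-based toolchain): the
   * __kmp_nested_nth table consulted above is typically filled from a
   * list-valued OMP_NUM_THREADS setting, e.g. from the shell:
   *
   *   OMP_NUM_THREADS=4,2 ./a.out    # 4 threads at level 0, 2 at level 1
   *
   * A nested region then picks up the per-level value:
   *
   *   #include <omp.h>
   *   #include <stdio.h>
   *   int main(void) {
   *       omp_set_nested(1);
   *       #pragma omp parallel               // level 0
   *       #pragma omp parallel               // level 1
   *       #pragma omp single
   *       printf("inner team size: %d\n", omp_get_num_threads());
   *       return 0;
   *   }
   */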
1990 
1991 #if OMP_40_ENABLED
1992  // Figure out the proc_bind_policy for the new team.
1993  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1994  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1995  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1996  proc_bind = proc_bind_false;
1997  }
1998  else {
1999  if (proc_bind == proc_bind_default) {
2000  // No proc_bind clause specified; use current proc-bind-var for this parallel region
2001  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2002  }
2003  /* else: The proc_bind policy was specified explicitly on parallel clause. This
2004  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
2005  // Figure the value of proc-bind-var for the child threads.
2006  if ((level+1 < __kmp_nested_proc_bind.used)
2007  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
2008  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
2009  }
2010  }
2011 
2012  // Reset for next parallel region
2013  master_th->th.th_set_proc_bind = proc_bind_default;
2014 #endif /* OMP_40_ENABLED */
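  /*
   * Editor's note (illustrative user-level sketch, not runtime code): the
   * proc_bind resolution above is driven by OMP_PROC_BIND and/or an explicit
   * proc_bind clause, e.g.:
   *
   *   #include <omp.h>
   *   #include <stdio.h>
   *   int main(void) {
   *       #pragma omp parallel proc_bind(close) num_threads(4)
   *       printf("thread %d placed close to the master\n", omp_get_thread_num());
   *       return 0;
   *   }
   */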
2015 
2016  if ((nthreads_icv > 0)
2017 #if OMP_40_ENABLED
2018  || (proc_bind_icv != proc_bind_default)
2019 #endif /* OMP_40_ENABLED */
2020  ) {
2021  kmp_internal_control_t new_icvs;
2022  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2023  new_icvs.next = NULL;
2024  if (nthreads_icv > 0) {
2025  new_icvs.nproc = nthreads_icv;
2026  }
2027 
2028 #if OMP_40_ENABLED
2029  if (proc_bind_icv != proc_bind_default) {
2030  new_icvs.proc_bind = proc_bind_icv;
2031  }
2032 #endif /* OMP_40_ENABLED */
2033 
2034  /* allocate a new parallel team */
2035  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2036  team = __kmp_allocate_team(root, nthreads, nthreads,
2037 #if OMPT_SUPPORT
2038  ompt_parallel_id,
2039 #endif
2040 #if OMP_40_ENABLED
2041  proc_bind,
2042 #endif
2043  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
2044  } else {
2045  /* allocate a new parallel team */
2046  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2047  team = __kmp_allocate_team(root, nthreads, nthreads,
2048 #if OMPT_SUPPORT
2049  ompt_parallel_id,
2050 #endif
2051 #if OMP_40_ENABLED
2052  proc_bind,
2053 #endif
2054  &master_th->th.th_current_task->td_icvs, argc
2055  USE_NESTED_HOT_ARG(master_th) );
2056  }
2057  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2058 
2059  /* setup the new team */
2060  team->t.t_master_tid = master_tid;
2061  team->t.t_master_this_cons = master_this_cons;
2062  team->t.t_ident = loc;
2063  team->t.t_parent = parent_team;
2064  TCW_SYNC_PTR(team->t.t_pkfn, microtask);
2065 #if OMPT_SUPPORT
2066  TCW_SYNC_PTR(team->t.ompt_team_info.microtask, unwrapped_task);
2067 #endif
2068  team->t.t_invoke = invoker; /* TODO move this to root, maybe */
2069  // TODO: parent_team->t.t_level == INT_MAX ???
2070 #if OMP_40_ENABLED
2071  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2072 #endif /* OMP_40_ENABLED */
2073  team->t.t_level = parent_team->t.t_level + 1;
2074  team->t.t_active_level = parent_team->t.t_active_level + 1;
2075 #if OMP_40_ENABLED
2076  } else {
2077  // AC: Do not increase parallel level at start of the teams construct
2078  team->t.t_level = parent_team->t.t_level;
2079  team->t.t_active_level = parent_team->t.t_active_level;
2080  }
2081 #endif /* OMP_40_ENABLED */
2082  team->t.t_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
2083 
2084 #if OMP_40_ENABLED
2085  team->t.t_cancel_request = cancel_noreq;
2086 #endif
2087 
2088  // Update the floating point rounding in the team if required.
2089  propagateFPControl(team);
2090 
2091  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2092  // Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2093 #if 0
2094  // Patch out an assertion that trips while the runtime seems to operate correctly.
2095  // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2096  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2097 #endif
2098  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2099  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2100  parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2101  if (level) {
2102  // Take a memo of master's task_state
2103  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2104  if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2105  kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2106  kmp_uint8 *old_stack, *new_stack;
2107  kmp_uint32 i;
2108  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2109  for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2110  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2111  }
2112  for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2113  new_stack[i] = 0;
2114  }
2115  old_stack = master_th->th.th_task_state_memo_stack;
2116  master_th->th.th_task_state_memo_stack = new_stack;
2117  master_th->th.th_task_state_stack_sz = new_size;
2118  __kmp_free(old_stack);
2119  }
2120  // Store master's task_state on stack
2121  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2122  master_th->th.th_task_state_top++;
2123 #if KMP_NESTED_HOT_TEAMS
2124  if (team == master_th->th.th_hot_teams[level].hot_team) { // Restore master's nested state if nested hot team
2125  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2126  }
2127  else {
2128 #endif
2129  master_th->th.th_task_state = 0;
2130 #if KMP_NESTED_HOT_TEAMS
2131  }
2132 #endif
2133  }
2134 #if !KMP_NESTED_HOT_TEAMS
2135  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2136 #endif
2137  }
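  /*
   * Editor's sketch of the doubling growth used for the task-state memo stack
   * above, with generic types and hypothetical names (the runtime itself uses
   * __kmp_allocate/__kmp_free); requires <stdlib.h> and <string.h>:
   *
   *   static unsigned char *grow_byte_stack(unsigned char *old, size_t *sz) {
   *       size_t new_sz = 2 * *sz;
   *       unsigned char *p = (unsigned char *)calloc(new_sz, 1); // rest stays zero
   *       memcpy(p, old, *sz);      // copy the existing entries
   *       free(old);
   *       *sz = new_sz;
   *       return p;
   *   }
   */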
2138 
2139  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2140  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2141  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2142  ( team->t.t_master_tid == 0 &&
2143  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2144  KMP_MB();
2145 
2146  /* now, setup the arguments */
2147  argv = (void**)team->t.t_argv;
2148 #if OMP_40_ENABLED
2149  if ( ap ) {
2150 #endif /* OMP_40_ENABLED */
2151  for ( i=argc-1; i >= 0; --i )
2152 // TODO: revert workaround for Intel(R) 64 tracker #96
2153 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2154  *argv++ = va_arg( *ap, void * );
2155 #else
2156  *argv++ = va_arg( ap, void * );
2157 #endif
2158 #if OMP_40_ENABLED
2159  } else {
2160  for ( i=0; i < argc; ++i )
2161  // Get args from parent team for teams construct
2162  argv[i] = team->t.t_parent->t.t_argv[i];
2163  }
2164 #endif /* OMP_40_ENABLED */
2165 
2166  /* now actually fork the threads */
2167  team->t.t_master_active = master_active;
2168  if (!root->r.r_active) // Only do the assignment when needed, to avoid cache ping-pong
2169  root->r.r_active = TRUE;
2170 
2171  __kmp_fork_team_threads( root, team, master_th, gtid );
2172  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2173 
2174 #if OMPT_SUPPORT
2175  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2176 #endif
2177 
2178  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2179 
2180 
2181 #if USE_ITT_BUILD
2182  if ( team->t.t_active_level == 1 // only report frames at level 1
2183 # if OMP_40_ENABLED
2184  && !master_th->th.th_teams_microtask // not in teams construct
2185 # endif /* OMP_40_ENABLED */
2186  ) {
2187 #if USE_ITT_NOTIFY
2188  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2189  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2190  {
2191  kmp_uint64 tmp_time = 0;
2192  if ( __itt_get_timestamp_ptr )
2193  tmp_time = __itt_get_timestamp();
2194  // Internal fork - report frame begin
2195  master_th->th.th_frame_time = tmp_time;
2196  if ( __kmp_forkjoin_frames_mode == 3 )
2197  team->t.t_region_time = tmp_time;
2198  } else // only one notification scheme (either "submit" or "forking/joined", not both)
2199 #endif /* USE_ITT_NOTIFY */
2200  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2201  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2202  { // Mark start of "parallel" region for VTune.
2203  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2204  }
2205  }
2206 #endif /* USE_ITT_BUILD */
2207 
2208  /* now go on and do the work */
2209  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2210  KMP_MB();
2211  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2212  root, team, master_th, gtid));
2213 
2214 #if USE_ITT_BUILD
2215  if ( __itt_stack_caller_create_ptr ) {
2216  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2217  }
2218 #endif /* USE_ITT_BUILD */
2219 
2220 #if OMP_40_ENABLED
2221  if ( ap ) // AC: skip __kmp_internal_fork at the teams construct; let only the master threads execute
2222 #endif /* OMP_40_ENABLED */
2223  {
2224  __kmp_internal_fork( loc, gtid, team );
2225  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2226  root, team, master_th, gtid));
2227  }
2228 
2229  if (call_context == fork_context_gnu) {
2230  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2231  return TRUE;
2232  }
2233 
2234  /* Invoke microtask for MASTER thread */
2235  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2236  gtid, team->t.t_id, team->t.t_pkfn ) );
2237  } // END of timer KMP_fork_call block
2238 
2239  {
2240  KMP_TIME_BLOCK(OMP_work);
2241  // KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
2242  if (! team->t.t_invoke( gtid )) {
2243  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2244  }
2245  }
2246  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2247  gtid, team->t.t_id, team->t.t_pkfn ) );
2248  KMP_MB(); /* Flush all pending memory write invalidates. */
2249 
2250  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2251 
2252 #if OMPT_SUPPORT
2253  if (ompt_enabled) {
2254  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2255  }
2256 #endif
2257 
2258  return TRUE;
2259 }
2260 
2261 #if OMPT_SUPPORT
2262 static inline void
2263 __kmp_join_restore_state(
2264  kmp_info_t *thread,
2265  kmp_team_t *team)
2266 {
2267  // restore state outside the region
2268  thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2269  ompt_state_work_serial : ompt_state_work_parallel);
2270 }
2271 
2272 static inline void
2273 __kmp_join_ompt(
2274  kmp_info_t *thread,
2275  kmp_team_t *team,
2276  ompt_parallel_id_t parallel_id,
2277  fork_context_e fork_context)
2278 {
2279  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2280  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2281  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2282  parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2283  }
2284 
2285  __kmp_join_restore_state(thread,team);
2286 }
2287 #endif
2288 
2289 void
2290 __kmp_join_call(ident_t *loc, int gtid
2291 #if OMPT_SUPPORT
2292  , enum fork_context_e fork_context
2293 #endif
2294 #if OMP_40_ENABLED
2295  , int exit_teams
2296 #endif /* OMP_40_ENABLED */
2297 )
2298 {
2299  KMP_TIME_DEVELOPER_BLOCK(KMP_join_call);
2300  kmp_team_t *team;
2301  kmp_team_t *parent_team;
2302  kmp_info_t *master_th;
2303  kmp_root_t *root;
2304  int master_active;
2305  int i;
2306 
2307  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2308 
2309  /* setup current data */
2310  master_th = __kmp_threads[ gtid ];
2311  root = master_th->th.th_root;
2312  team = master_th->th.th_team;
2313  parent_team = team->t.t_parent;
2314 
2315  master_th->th.th_ident = loc;
2316 
2317 #if OMPT_SUPPORT
2318  if (ompt_enabled) {
2319  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2320  }
2321 #endif
2322 
2323 #if KMP_DEBUG
2324  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2325  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2326  __kmp_gtid_from_thread( master_th ), team,
2327  team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2328  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2329  }
2330 #endif
2331 
2332  if( team->t.t_serialized ) {
2333 #if OMP_40_ENABLED
2334  if ( master_th->th.th_teams_microtask ) {
2335  // We are in teams construct
2336  int level = team->t.t_level;
2337  int tlevel = master_th->th.th_teams_level;
2338  if ( level == tlevel ) {
2339  // AC: we haven't incremented it earlier at start of teams construct,
2340  // so do it here - at the end of teams construct
2341  team->t.t_level++;
2342  } else if ( level == tlevel + 1 ) {
2343  // AC: we are exiting parallel inside teams, need to increment serialization
2344  // in order to restore it in the next call to __kmpc_end_serialized_parallel
2345  team->t.t_serialized++;
2346  }
2347  }
2348 #endif /* OMP_40_ENABLED */
2349  __kmpc_end_serialized_parallel( loc, gtid );
2350 
2351 #if OMPT_SUPPORT
2352  if (ompt_enabled) {
2353  __kmp_join_restore_state(master_th, parent_team);
2354  }
2355 #endif
2356 
2357  return;
2358  }
2359 
2360  master_active = team->t.t_master_active;
2361 
2362 #if OMP_40_ENABLED
2363  if (!exit_teams)
2364 #endif /* OMP_40_ENABLED */
2365  {
2366  // AC: No barrier for internal teams at exit from the teams construct,
2367  // but there is a barrier for the external team (league).
2368  __kmp_internal_join( loc, gtid, team );
2369  }
2370 #if OMP_40_ENABLED
2371  else {
2372  master_th->th.th_task_state = 0; // AC: no tasking in teams (outside of any parallel)
2373  }
2374 #endif /* OMP_40_ENABLED */
2375 
2376  KMP_MB();
2377 
2378 #if OMPT_SUPPORT
2379  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2380 #endif
2381 
2382 #if USE_ITT_BUILD
2383  if ( __itt_stack_caller_create_ptr ) {
2384  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2385  }
2386 
2387  // Mark end of "parallel" region for VTune.
2388  if ( team->t.t_active_level == 1
2389 # if OMP_40_ENABLED
2390  && !master_th->th.th_teams_microtask /* not in teams construct */
2391 # endif /* OMP_40_ENABLED */
2392  ) {
2393  master_th->th.th_ident = loc;
2394  // only one notification scheme (either "submit" or "forking/joined", not both)
2395  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2396  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2397  0, loc, master_th->th.th_team_nproc, 1 );
2398  else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2399  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2400  __kmp_itt_region_joined( gtid );
2401  } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403 
2404 #if OMP_40_ENABLED
2405  if ( master_th->th.th_teams_microtask &&
2406  !exit_teams &&
2407  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2408  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2409  // AC: We need to leave the team structure intact at the end
2410  // of the parallel inside the teams construct, so that the same
2411  // (hot) team works at the next parallel region; only adjust nesting levels
2412 
2413  /* Decrement our nested depth level */
2414  team->t.t_level --;
2415  team->t.t_active_level --;
2416  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2417 
2418  /* Restore number of threads in the team if needed */
2419  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2420  int old_num = master_th->th.th_team_nproc;
2421  int new_num = master_th->th.th_teams_size.nth;
2422  kmp_info_t **other_threads = team->t.t_threads;
2423  team->t.t_nproc = new_num;
2424  for ( i = 0; i < old_num; ++i ) {
2425  other_threads[i]->th.th_team_nproc = new_num;
2426  }
2427  // Adjust the state of the unused threads of the team
2428  for ( i = old_num; i < new_num; ++i ) {
2429  // Re-initialize thread's barrier data.
2430  int b;
2431  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2432  for ( b = 0; b < bs_last_barrier; ++ b ) {
2433  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2434  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2435 #if USE_DEBUGGER
2436  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2437 #endif
2438  }
2439  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2440  // Synchronize thread's task state
2441  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2442  }
2443  }
2444  }
2445 
2446 #if OMPT_SUPPORT
2447  if (ompt_enabled) {
2448  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2449  }
2450 #endif
2451 
2452  return;
2453  }
2454 #endif /* OMP_40_ENABLED */
2455 
2456  /* do cleanup and restore the parent team */
2457  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2458  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2459 
2460  master_th->th.th_dispatch =
2461  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2462 
2463  /* jc: The following lock has instructions with REL and ACQ semantics,
2464  separating the parallel user code called in this parallel region
2465  from the serial user code called after this function returns.
2466  */
2467  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2468 
2469 #if OMP_40_ENABLED
2470  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2471 #endif /* OMP_40_ENABLED */
2472  {
2473  /* Decrement our nested depth level */
2474  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2475  }
2476  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2477 
2478  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2479  0, master_th, team ) );
2480  __kmp_pop_current_task_from_thread( master_th );
2481 
2482 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2483  //
2484  // Restore master thread's partition.
2485  //
2486  master_th->th.th_first_place = team->t.t_first_place;
2487  master_th->th.th_last_place = team->t.t_last_place;
2488 #endif /* OMP_40_ENABLED */
2489 
2490  updateHWFPControl (team);
2491 
2492  if ( root->r.r_active != master_active )
2493  root->r.r_active = master_active;
2494 
2495  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2496 
2497  /* This race was fun to find. Make sure the following is in the critical
2498  * region; otherwise assertions may fail occasionally because the old team
2499  * may be reallocated and the hierarchy will appear inconsistent. It is
2500  * actually safe to run and won't cause any bugs, but it will cause those
2501  * assertion failures. It's only one dereference and assignment, so we might
2502  * as well put it in the critical region. */
2503  master_th->th.th_team = parent_team;
2504  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2505  master_th->th.th_team_master = parent_team->t.t_threads[0];
2506  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2507 
2508  /* restore serialized team, if need be */
2509  if( parent_team->t.t_serialized &&
2510  parent_team != master_th->th.th_serial_team &&
2511  parent_team != root->r.r_root_team ) {
2512  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2513  master_th->th.th_serial_team = parent_team;
2514  }
2515 
2516  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2517  if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2518  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2519  // Remember master's state if we re-use this nested hot team
2520  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2521  --master_th->th.th_task_state_top; // pop
2522  // Now restore state at this level
2523  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2524  }
2525  // Copy the task team from the parent team to the master thread
2526  master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2527  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2528  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2529  }
2530 
2531  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2532  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2533  master_th->th.th_current_task->td_flags.executing = 1;
2534 
2535  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2536 
2537 #if OMPT_SUPPORT
2538  if (ompt_enabled) {
2539  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2540  }
2541 #endif
2542 
2543  KMP_MB();
2544  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2545 }
2546 
2547 /* ------------------------------------------------------------------------ */
2548 /* ------------------------------------------------------------------------ */
2549 
2550 /* Check whether we should push an internal control record onto the
2551  serial team stack. If so, do it. */
2552 void
2553 __kmp_save_internal_controls ( kmp_info_t * thread )
2554 {
2555 
2556  if ( thread->th.th_team != thread->th.th_serial_team ) {
2557  return;
2558  }
2559  if (thread->th.th_team->t.t_serialized > 1) {
2560  int push = 0;
2561 
2562  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2563  push = 1;
2564  } else {
2565  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2566  thread->th.th_team->t.t_serialized ) {
2567  push = 1;
2568  }
2569  }
2570  if (push) { /* push a record on the serial team's stack */
2571  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2572 
2573  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2574 
2575  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2576 
2577  control->next = thread->th.th_team->t.t_control_stack_top;
2578  thread->th.th_team->t.t_control_stack_top = control;
2579  }
2580  }
2581 }
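/*
 * Editor's sketch of the push pattern used above, with simplified types and
 * hypothetical names (not runtime identifiers): a record is pushed at most
 * once per serialization nesting level.
 *
 *   #include <stdlib.h>
 *   struct ctl_rec { int serial_nesting_level; struct ctl_rec *next; };
 *
 *   static void push_control(struct ctl_rec **top, int level) {
 *       if (*top == NULL || (*top)->serial_nesting_level != level) {
 *           struct ctl_rec *rec = (struct ctl_rec *)malloc(sizeof(*rec));
 *           rec->serial_nesting_level = level;   // remember the level of this snapshot
 *           rec->next = *top;                    // link onto the existing stack
 *           *top = rec;                          // new top of the control stack
 *       }
 *   }
 */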
2582 
2583 /* Changes set_nproc */
2584 void
2585 __kmp_set_num_threads( int new_nth, int gtid )
2586 {
2587  kmp_info_t *thread;
2588  kmp_root_t *root;
2589 
2590  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2591  KMP_DEBUG_ASSERT( __kmp_init_serial );
2592 
2593  if (new_nth < 1)
2594  new_nth = 1;
2595  else if (new_nth > __kmp_max_nth)
2596  new_nth = __kmp_max_nth;
2597 
2598  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2599  thread = __kmp_threads[gtid];
2600 
2601  __kmp_save_internal_controls( thread );
2602 
2603  set__nproc( thread, new_nth );
2604 
2605  //
2606  // If this omp_set_num_threads() call will cause the hot team size to be
2607  // reduced (in the absence of a num_threads clause), then reduce it now,
2608  // rather than waiting for the next parallel region.
2609  //
2610  root = thread->th.th_root;
2611  if ( __kmp_init_parallel && ( ! root->r.r_active )
2612  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2613 #if KMP_NESTED_HOT_TEAMS
2614  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2615 #endif
2616  ) {
2617  kmp_team_t *hot_team = root->r.r_hot_team;
2618  int f;
2619 
2620  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2621 
2622 
2623  // Release the extra threads we don't need any more.
2624  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2625  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2626  if ( __kmp_tasking_mode != tskm_immediate_exec) {
2627  // When decreasing the team size, threads no longer in the team should unreference the task team.
2628  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2629  }
2630  __kmp_free_thread( hot_team->t.t_threads[f] );
2631  hot_team->t.t_threads[f] = NULL;
2632  }
2633  hot_team->t.t_nproc = new_nth;
2634 #if KMP_NESTED_HOT_TEAMS
2635  if( thread->th.th_hot_teams ) {
2636  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2637  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2638  }
2639 #endif
2640 
2641 
2642  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2643 
2644  //
2645  // Update the t_nproc field in the threads that are still active.
2646  //
2647  for( f=0 ; f < new_nth; f++ ) {
2648  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2649  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2650  }
2651  // Special flag to indicate an omp_set_num_threads() call
2652  hot_team->t.t_size_changed = -1;
2653  }
2654 }
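/*
 * Editor's note (illustrative usage sketch, not part of the runtime): the
 * clamping and hot-team shrinking above are reached through the standard API:
 *
 *   #include <omp.h>
 *   #include <stdio.h>
 *   int main(void) {
 *       #pragma omp parallel        // hot team created at the default size
 *       { }
 *       omp_set_num_threads(2);     // hot team may be shrunk right here
 *       #pragma omp parallel
 *       #pragma omp single
 *       printf("now %d threads\n", omp_get_num_threads());
 *       return 0;
 *   }
 */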
2655 
2656 /* Changes max_active_levels */
2657 void
2658 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2659 {
2660  kmp_info_t *thread;
2661 
2662  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2663  KMP_DEBUG_ASSERT( __kmp_init_serial );
2664 
2665  // validate max_active_levels
2666  if( max_active_levels < 0 ) {
2667  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2668  // We ignore this call if the user has specified a negative value.
2669  // The current setting won't be changed. The last valid setting will be used.
2670  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2671  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2672  return;
2673  }
2674  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2675  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2676  // We allow a zero value. (implementation defined behavior)
2677  } else {
2678  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2679  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2680  // Current upper limit is MAX_INT. (implementation defined behavior)
2681  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2682  // Actually, the flow should never get here as long as we use the MAX_INT limit.
2683  }
2684  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2685 
2686  thread = __kmp_threads[ gtid ];
2687 
2688  __kmp_save_internal_controls( thread );
2689 
2690  set__max_active_levels( thread, max_active_levels );
2691 
2692 }
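/*
 * Editor's note (illustrative usage sketch): negative values are ignored with
 * a warning, as validated above, and a zero value is also accepted.
 *
 *   #include <omp.h>
 *   #include <stdio.h>
 *   int main(void) {
 *       omp_set_max_active_levels(-1);  // ignored; only a warning is issued
 *       omp_set_max_active_levels(1);   // at most one active level
 *       printf("max active levels: %d\n", omp_get_max_active_levels());
 *       return 0;
 *   }
 */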
2693 
2694 /* Gets max_active_levels */
2695 int
2696 __kmp_get_max_active_levels( int gtid )
2697 {
2698  kmp_info_t *thread;
2699 
2700  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2701  KMP_DEBUG_ASSERT( __kmp_init_serial );
2702 
2703  thread = __kmp_threads[ gtid ];
2704  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2705  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2706  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2707  return thread->th.th_current_task->td_icvs.max_active_levels;
2708 }
2709 
2710 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2711 void
2712 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2713 {
2714  kmp_info_t *thread;
2715 // kmp_team_t *team;
2716 
2717  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2718  KMP_DEBUG_ASSERT( __kmp_init_serial );
2719 
2720  // Check if the kind parameter is valid, correct if needed.
2721  // Valid parameters should fit in one of two intervals - standard or extended:
2722  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2723  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2724  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2725  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2726  {
2727  // TODO: Hint needs attention in case we change the default schedule.
2728  __kmp_msg(
2729  kmp_ms_warning,
2730  KMP_MSG( ScheduleKindOutOfRange, kind ),
2731  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2732  __kmp_msg_null
2733  );
2734  kind = kmp_sched_default;
2735  chunk = 0; // ignore chunk value in case of bad kind
2736  }
2737 
2738  thread = __kmp_threads[ gtid ];
2739 
2740  __kmp_save_internal_controls( thread );
2741 
2742  if ( kind < kmp_sched_upper_std ) {
2743  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2744  // differentiate static chunked vs. unchunked:
2745  // chunk should be invalid to indicate an unchunked schedule (which is the default)
2746  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2747  } else {
2748  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2749  }
2750  } else {
2751  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2752  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2753  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2754  }
2755  if ( kind == kmp_sched_auto ) {
2756  // ignore parameter chunk for schedule auto
2757  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2758  } else {
2759  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2760  }
2761 }
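/*
 * Editor's note (illustrative usage sketch): the kind/chunk mapping above is
 * reached through the standard API, e.g.:
 *
 *   #include <omp.h>
 *   int main(void) {
 *       omp_set_schedule(omp_sched_static, 0);   // unchunked static
 *       omp_set_schedule(omp_sched_dynamic, 4);  // dynamic with a chunk of 4
 *       return 0;
 *   }
 */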
2762 
2763 /* Gets def_sched_var ICV values */
2764 void
2765 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2766 {
2767  kmp_info_t *thread;
2768  enum sched_type th_type;
2769 
2770  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2771  KMP_DEBUG_ASSERT( __kmp_init_serial );
2772 
2773  thread = __kmp_threads[ gtid ];
2774 
2775  //th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
2776  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2777 
2778  switch ( th_type ) {
2779  case kmp_sch_static:
2780  case kmp_sch_static_greedy:
2781  case kmp_sch_static_balanced:
2782  *kind = kmp_sched_static;
2783  *chunk = 0; // chunk was not set; indicate this fact with a zero value
2784  return;
2785  case kmp_sch_static_chunked:
2786  *kind = kmp_sched_static;
2787  break;
2788  case kmp_sch_dynamic_chunked:
2789  *kind = kmp_sched_dynamic;
2790  break;
2791  case kmp_sch_guided_chunked:
2792  case kmp_sch_guided_iterative_chunked:
2793  case kmp_sch_guided_analytical_chunked:
2794  *kind = kmp_sched_guided;
2795  break;
2796  case kmp_sch_auto:
2797  *kind = kmp_sched_auto;
2798  break;
2799  case kmp_sch_trapezoidal:
2800  *kind = kmp_sched_trapezoidal;
2801  break;
2802 /*
2803  case kmp_sch_static_steal:
2804  *kind = kmp_sched_static_steal;
2805  break;
2806 */
2807  default:
2808  KMP_FATAL( UnknownSchedulingType, th_type );
2809  }
2810 
2811  //*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
2812  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2813 }
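/*
 * Editor's note (illustrative usage sketch): the reverse mapping above backs
 * omp_get_schedule(); for a plain static schedule, a zero chunk indicates an
 * unchunked schedule, as noted above:
 *
 *   #include <omp.h>
 *   #include <stdio.h>
 *   int main(void) {
 *       omp_sched_t kind;
 *       int chunk;
 *       omp_get_schedule(&kind, &chunk);
 *       printf("kind=%d chunk=%d\n", (int)kind, chunk);
 *       return 0;
 *   }
 */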
2814 
2815 int
2816 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2817 
2818  int ii, dd;
2819  kmp_team_t *team;
2820  kmp_info_t *thr;
2821 
2822  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2823  KMP_DEBUG_ASSERT( __kmp_init_serial );
2824 
2825  // validate level
2826  if( level == 0 ) return 0;
2827  if( level < 0 ) return -1;
2828  thr = __kmp_threads[ gtid ];
2829  team = thr->th.th_team;
2830  ii = team->t.t_level;
2831  if( level > ii ) return -1;
2832 
2833 #if OMP_40_ENABLED
2834  if( thr->th.th_teams_microtask ) {
2835  // AC: we are in a teams region where multiple nested teams have the same level
2836  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2837  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2838  KMP_DEBUG_ASSERT( ii >= tlevel );
2839  // AC: As we need to pass through the teams league, we artificially increase ii
2840  if ( ii == tlevel ) {
2841  ii += 2; // three teams have the same level
2842  } else {
2843  ii ++; // two teams have the same level
2844  }
2845  }
2846  }
2847 #endif
2848 
2849  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2850 
2851  dd = team->t.t_serialized;
2852  level++;
2853  while( ii > level )
2854  {
2855  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2856  {
2857  }
2858  if( ( team->t.t_serialized ) && ( !dd ) ) {
2859  team = team->t.t_parent;
2860  continue;
2861  }
2862  if( ii > level ) {
2863  team = team->t.t_parent;
2864  dd = team->t.t_serialized;
2865  ii--;
2866  }
2867  }
2868 
2869  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2870 }
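/*
 * Editor's note (illustrative usage sketch): this routine backs
 * omp_get_ancestor_thread_num(); level 0 refers to the initial (outermost)
 * implicit task and the current nesting level is given by omp_get_level():
 *
 *   #include <omp.h>
 *   #include <stdio.h>
 *   int main(void) {
 *       omp_set_nested(1);
 *       #pragma omp parallel num_threads(2)
 *       #pragma omp parallel num_threads(2)
 *       printf("outer=%d inner=%d\n",
 *              omp_get_ancestor_thread_num(1), omp_get_thread_num());
 *       return 0;
 *   }
 */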
2871 
2872 int
2873 __kmp_get_team_size( int gtid, int level ) {
2874 
2875  int ii, dd;
2876  kmp_team_t *team;
2877  kmp_info_t *thr;
2878 
2879  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2880  KMP_DEBUG_ASSERT( __kmp_init_serial );
2881 
2882  // validate level
2883  if( level == 0 ) return 1;
2884  if( level < 0 ) return -1;
2885  thr = __kmp_threads[ gtid ];
2886  team = thr->th.th_team;
2887  ii = team->t.t_level;
2888  if( level > ii ) return -1;
2889 
2890 #if OMP_40_ENABLED
2891  if( thr->th.th_teams_microtask ) {
2892  // AC: we are in a teams region where multiple nested teams have the same level
2893  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2894  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2895  KMP_DEBUG_ASSERT( ii >= tlevel );
2896  // AC: As we need to pass through the teams league, we artificially increase ii
2897  if ( ii == tlevel ) {
2898  ii += 2; // three teams have the same level
2899  } else {
2900  ii ++; // two teams have the same level
2901  }
2902  }
2903  }
2904 #endif
2905 
2906  while( ii > level )
2907  {
2908  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2909  {
2910  }
2911  if( team->t.t_serialized && ( !dd ) ) {
2912  team = team->t.t_parent;
2913  continue;
2914  }
2915  if( ii > level ) {
2916  team = team->t.t_parent;
2917  ii--;
2918  }
2919  }
2920 
2921  return team->t.t_nproc;
2922 }
2923 
2924 kmp_r_sched_t
2925 __kmp_get_schedule_global() {
2926 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2927 // may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
2928 
2929  kmp_r_sched_t r_sched;
2930 
2931  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2932 // __kmp_sched should keep its original value, so that the user can set KMP_SCHEDULE multiple times,
2933  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2934  if ( __kmp_sched == kmp_sch_static ) {
2935  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2936  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2937  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2938  } else {
2939  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2940  }
2941 
2942  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
2943  r_sched.chunk = KMP_DEFAULT_CHUNK;
2944  } else {
2945  r_sched.chunk = __kmp_chunk;
2946  }
2947 
2948  return r_sched;
2949 }
2950 
2951 /* ------------------------------------------------------------------------ */
2952 /* ------------------------------------------------------------------------ */
2953 
2954 
2955 /*
2956  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2957  * at least argc number of *t_argv entries for the requested team.
2958  */
2959 static void
2960 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2961 {
2962 
2963  KMP_DEBUG_ASSERT( team );
2964  if( !realloc || argc > team->t.t_max_argc ) {
2965 
2966  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2967  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2968  /* if previously allocated heap space for args, free them */
2969  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2970  __kmp_free( (void *) team->t.t_argv );
2971 
2972  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2973  /* use unused space in the cache line for arguments */
2974  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2975  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2976  team->t.t_id, team->t.t_max_argc ));
2977  team->t.t_argv = &team->t.t_inline_argv[0];
2978  if ( __kmp_storage_map ) {
2979  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2980  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2981  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2982  "team_%d.t_inline_argv",
2983  team->t.t_id );
2984  }
2985  } else {
2986  /* allocate space for arguments in the heap */
2987  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2988  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2989  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2990  team->t.t_id, team->t.t_max_argc ));
2991  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2992  if ( __kmp_storage_map ) {
2993  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2994  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2995  team->t.t_id );
2996  }
2997  }
2998  }
2999 }
3000 
3001 static void
3002 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
3003 {
3004  int i;
3005  int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
3006 #if KMP_USE_POOLED_ALLOC
3007  // AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
3008  char *ptr = __kmp_allocate(max_nth *
3009  ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buff
3010  + sizeof(kmp_disp_t) + sizeof(int)*6
3011  //+ sizeof(int)
3012  + sizeof(kmp_r_sched_t)
3013  + sizeof(kmp_taskdata_t) ) );
3014 
3015  team->t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
3016  team->t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
3017  ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
3018  team->t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
3019  team->t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
3020  team->t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
3021  team->t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
3022  team->t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
3023  team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
3024  team->t.t_set_bt_set = (int*) ptr;
3025  ptr += sizeof(int) * max_nth;
3026  //team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
3027  team->t.t_set_sched = (kmp_r_sched_t*) ptr;
3028  ptr += sizeof(kmp_r_sched_t) * max_nth;
3029  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
3030  ptr += sizeof(kmp_taskdata_t) * max_nth;
3031 #else
3032 
3033  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
3034  team->t.t_disp_buffer = (dispatch_shared_info_t*)
3035  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
3036  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
3037  //team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
3038  //team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
3039  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
3040 #endif
3041  team->t.t_max_nproc = max_nth;
3042 
3043  /* setup dispatch buffers */
3044  for(i = 0 ; i < num_disp_buff; ++i)
3045  team->t.t_disp_buffer[i].buffer_index = i;
3046 }
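
/*
 * Illustration only, not part of the library: the KMP_USE_POOLED_ALLOC branch above
 * carves several per-team arrays out of one allocation by advancing a byte pointer.
 * The standalone sketch below shows that carve-out idiom in isolation, using plain
 * malloc and two hypothetical arrays ("slots" and "flags" are made-up names).
 */
#if 0
#include <stdlib.h>

typedef struct pooled_arrays {
    void **slots;     /* carved from the front of the pool          */
    int   *flags;     /* carved immediately after the "slots" array */
    char  *pool;      /* the single underlying allocation           */
} pooled_arrays_t;

static int pooled_arrays_init( pooled_arrays_t *pa, int n )
{
    /* one allocation sized for both arrays, then hand out sub-ranges by bumping ptr */
    char *ptr = (char *) malloc( n * ( sizeof(void *) + sizeof(int) ) );
    if ( ptr == NULL )
        return -1;
    pa->pool  = ptr;
    pa->slots = (void **) ptr;   ptr += sizeof(void *) * n;
    pa->flags = (int *)   ptr;   /* ptr would keep advancing for further arrays */
    return 0;
}

static void pooled_arrays_free( pooled_arrays_t *pa )
{
    free( pa->pool );            /* a single free releases every carved array */
}
#endif
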
3047 
3048 static void
3049 __kmp_free_team_arrays(kmp_team_t *team) {
3050  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3051  int i;
3052  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
3053  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
3054  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
3055  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
3056  }; // if
3057  }; // for
3058  __kmp_free(team->t.t_threads);
3059  #if !KMP_USE_POOLED_ALLOC
3060  __kmp_free(team->t.t_disp_buffer);
3061  __kmp_free(team->t.t_dispatch);
3062  //__kmp_free(team->t.t_set_max_active_levels);
3063  //__kmp_free(team->t.t_set_sched);
3064  __kmp_free(team->t.t_implicit_task_taskdata);
3065  #endif
3066  team->t.t_threads = NULL;
3067  team->t.t_disp_buffer = NULL;
3068  team->t.t_dispatch = NULL;
3069  //team->t.t_set_sched = 0;
3070  //team->t.t_set_max_active_levels = 0;
3071  team->t.t_implicit_task_taskdata = 0;
3072 }
3073 
3074 static void
3075 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3076  kmp_info_t **oldThreads = team->t.t_threads;
3077 
3078  #if !KMP_USE_POOLED_ALLOC
3079  __kmp_free(team->t.t_disp_buffer);
3080  __kmp_free(team->t.t_dispatch);
3081  //__kmp_free(team->t.t_set_max_active_levels);
3082  //__kmp_free(team->t.t_set_sched);
3083  __kmp_free(team->t.t_implicit_task_taskdata);
3084  #endif
3085  __kmp_allocate_team_arrays(team, max_nth);
3086 
3087  KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3088 
3089  __kmp_free(oldThreads);
3090 }
3091 
3092 static kmp_internal_control_t
3093 __kmp_get_global_icvs( void ) {
3094 
3095  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3096 
3097 #if OMP_40_ENABLED
3098  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3099 #endif /* OMP_40_ENABLED */
3100 
3101  kmp_internal_control_t g_icvs = {
3102  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3103  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3104  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3105  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3106  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3107  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3108  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
3109  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3110  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3111  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3112 #if OMP_40_ENABLED
3113  __kmp_nested_proc_bind.bind_types[0],
3114 #endif /* OMP_40_ENABLED */
3115  NULL //struct kmp_internal_control *next;
3116  };
3117 
3118  return g_icvs;
3119 }
3120 
3121 static kmp_internal_control_t
3122 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3123 
3124  kmp_internal_control_t gx_icvs;
3125  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3126  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3127  gx_icvs.next = NULL;
3128 
3129  return gx_icvs;
3130 }
3131 
3132 static void
3133 __kmp_initialize_root( kmp_root_t *root )
3134 {
3135  int f;
3136  kmp_team_t *root_team;
3137  kmp_team_t *hot_team;
3138  int hot_team_max_nth;
3139  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3140  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3141  KMP_DEBUG_ASSERT( root );
3142  KMP_ASSERT( ! root->r.r_begin );
3143 
3144  /* setup the root state structure */
3145  __kmp_init_lock( &root->r.r_begin_lock );
3146  root->r.r_begin = FALSE;
3147  root->r.r_active = FALSE;
3148  root->r.r_in_parallel = 0;
3149  root->r.r_blocktime = __kmp_dflt_blocktime;
3150  root->r.r_nested = __kmp_dflt_nested;
3151 
3152  /* setup the root team for this task */
3153  /* allocate the root team structure */
3154  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3155 
3156  root_team =
3157  __kmp_allocate_team(
3158  root,
3159  1, // new_nproc
3160  1, // max_nproc
3161 #if OMPT_SUPPORT
3162  0, // root parallel id
3163 #endif
3164 #if OMP_40_ENABLED
3165  __kmp_nested_proc_bind.bind_types[0],
3166 #endif
3167  &r_icvs,
3168  0 // argc
3169  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3170  );
3171 #if USE_DEBUGGER
3172  // Non-NULL value should be assigned to make the debugger display the root team.
3173  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3174 #endif
3175 
3176  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3177 
3178  root->r.r_root_team = root_team;
3179  root_team->t.t_control_stack_top = NULL;
3180 
3181  /* initialize root team */
3182  root_team->t.t_threads[0] = NULL;
3183  root_team->t.t_nproc = 1;
3184  root_team->t.t_serialized = 1;
3185  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3186  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3187  root_team->t.t_sched.chunk = r_sched.chunk;
3188  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3189  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3190 
3191  /* setup the hot team for this task */
3192  /* allocate the hot team structure */
3193  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3194 
3195  hot_team =
3196  __kmp_allocate_team(
3197  root,
3198  1, // new_nproc
3199  __kmp_dflt_team_nth_ub * 2, // max_nproc
3200 #if OMPT_SUPPORT
3201  0, // root parallel id
3202 #endif
3203 #if OMP_40_ENABLED
3204  __kmp_nested_proc_bind.bind_types[0],
3205 #endif
3206  &r_icvs,
3207  0 // argc
3208  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3209  );
3210  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3211 
3212  root->r.r_hot_team = hot_team;
3213  root_team->t.t_control_stack_top = NULL;
3214 
3215  /* first-time initialization */
3216  hot_team->t.t_parent = root_team;
3217 
3218  /* initialize hot team */
3219  hot_team_max_nth = hot_team->t.t_max_nproc;
3220  for ( f = 0; f < hot_team_max_nth; ++ f ) {
3221  hot_team->t.t_threads[ f ] = NULL;
3222  }; // for
3223  hot_team->t.t_nproc = 1;
3224  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3225  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3226  hot_team->t.t_sched.chunk = r_sched.chunk;
3227  hot_team->t.t_size_changed = 0;
3228 }
3229 
3230 #ifdef KMP_DEBUG
3231 
3232 
3233 typedef struct kmp_team_list_item {
3234  kmp_team_p const * entry;
3235  struct kmp_team_list_item * next;
3236 } kmp_team_list_item_t;
3237 typedef kmp_team_list_item_t * kmp_team_list_t;
3238 
3239 
3240 static void
3241 __kmp_print_structure_team_accum( // Add team to list of teams.
3242  kmp_team_list_t list, // List of teams.
3243  kmp_team_p const * team // Team to add.
3244 ) {
3245 
3246  // List must terminate with item where both entry and next are NULL.
3247  // Team is added to the list only once.
3248  // List is sorted in ascending order by team id.
3249  // Team id is *not* a key.
3250 
3251  kmp_team_list_t l;
3252 
3253  KMP_DEBUG_ASSERT( list != NULL );
3254  if ( team == NULL ) {
3255  return;
3256  }; // if
3257 
3258  __kmp_print_structure_team_accum( list, team->t.t_parent );
3259  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3260 
3261  // Search list for the team.
3262  l = list;
3263  while ( l->next != NULL && l->entry != team ) {
3264  l = l->next;
3265  }; // while
3266  if ( l->next != NULL ) {
3267  return; // Team has been added before, exit.
3268  }; // if
3269 
3270  // Team is not found. Search list again for insertion point.
3271  l = list;
3272  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3273  l = l->next;
3274  }; // while
3275 
3276  // Insert team.
3277  {
3278  kmp_team_list_item_t * item =
3279  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3280  * item = * l;
3281  l->entry = team;
3282  l->next = item;
3283  }
3284 
3285 }
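
/*
 * Illustration only, not part of the library: __kmp_print_structure_team_accum above
 * keeps its list terminated by a sentinel item whose entry and next are both NULL, and
 * inserts a new element *before* node l without a back pointer by copying l into a
 * fresh item and overwriting l in place. The standalone sketch below shows just that
 * insertion trick on a hypothetical list of ints (no duplicate check, unlike the code
 * above); the caller is assumed to allocate the initial sentinel with value 0 and
 * next NULL.
 */
#if 0
#include <stdlib.h>

typedef struct int_list_item {
    int                     value;   /* 0 in the sentinel    */
    struct int_list_item *  next;    /* NULL in the sentinel */
} int_list_item_t;

static void int_list_insert_sorted( int_list_item_t * list, int value )
{
    int_list_item_t * l = list;
    while ( l->next != NULL && l->value <= value ) {
        l = l->next;                 /* stop at the first larger element or the sentinel */
    }
    {
        int_list_item_t * item = (int_list_item_t *) malloc( sizeof( int_list_item_t ) );
        if ( item == NULL )
            return;
        * item = * l;                /* push the old contents of l one node down ...  */
        l->value = value;            /* ... and reuse l itself for the new element    */
        l->next  = item;
    }
}
#endif
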
3286 
3287 static void
3288 __kmp_print_structure_team(
3289  char const * title,
3290  kmp_team_p const * team
3291 
3292 ) {
3293  __kmp_printf( "%s", title );
3294  if ( team != NULL ) {
3295  __kmp_printf( "%2x %p\n", team->t.t_id, team );
3296  } else {
3297  __kmp_printf( " - (nil)\n" );
3298  }; // if
3299 }
3300 
3301 static void
3302 __kmp_print_structure_thread(
3303  char const * title,
3304  kmp_info_p const * thread
3305 
3306 ) {
3307  __kmp_printf( "%s", title );
3308  if ( thread != NULL ) {
3309  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3310  } else {
3311  __kmp_printf( " - (nil)\n" );
3312  }; // if
3313 }
3314 
3315 void
3316 __kmp_print_structure(
3317  void
3318 ) {
3319 
3320  kmp_team_list_t list;
3321 
3322  // Initialize list of teams.
3323  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3324  list->entry = NULL;
3325  list->next = NULL;
3326 
3327  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3328  {
3329  int gtid;
3330  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3331  __kmp_printf( "%2d", gtid );
3332  if ( __kmp_threads != NULL ) {
3333  __kmp_printf( " %p", __kmp_threads[ gtid ] );
3334  }; // if
3335  if ( __kmp_root != NULL ) {
3336  __kmp_printf( " %p", __kmp_root[ gtid ] );
3337  }; // if
3338  __kmp_printf( "\n" );
3339  }; // for gtid
3340  }
3341 
3342  // Print out __kmp_threads array.
3343  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3344  if ( __kmp_threads != NULL ) {
3345  int gtid;
3346  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3347  kmp_info_t const * thread = __kmp_threads[ gtid ];
3348  if ( thread != NULL ) {
3349  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3350  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
3351  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
3352  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
3353  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
3354  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
3355  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
3356  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
3357 #if OMP_40_ENABLED
3358  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
3359 #endif
3360  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
3361  __kmp_printf( "\n" );
3362  __kmp_print_structure_team_accum( list, thread->th.th_team );
3363  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3364  }; // if
3365  }; // for gtid
3366  } else {
3367  __kmp_printf( "Threads array is not allocated.\n" );
3368  }; // if
3369 
3370  // Print out __kmp_root array.
3371  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3372  if ( __kmp_root != NULL ) {
3373  int gtid;
3374  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3375  kmp_root_t const * root = __kmp_root[ gtid ];
3376  if ( root != NULL ) {
3377  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3378  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3379  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3380  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3381  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3382  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3383  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3384  __kmp_printf( "\n" );
3385  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3386  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3387  }; // if
3388  }; // for gtid
3389  } else {
3390  __kmp_printf( "Ubers array is not allocated.\n" );
3391  }; // if
3392 
3393  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3394  while ( list->next != NULL ) {
3395  kmp_team_p const * team = list->entry;
3396  int i;
3397  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3398  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3399  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3400  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3401  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3402  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3403  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3404  __kmp_printf( " Thread %2d: ", i );
3405  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3406  }; // for i
3407  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3408  __kmp_printf( "\n" );
3409  list = list->next;
3410  }; // while
3411 
3412  // Print out __kmp_thread_pool and __kmp_team_pool.
3413  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3414  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3415  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3416  __kmp_printf( "\n" );
3417 
3418  // Free team list.
3419  while ( list != NULL ) {
3420  kmp_team_list_item_t * item = list;
3421  list = list->next;
3422  KMP_INTERNAL_FREE( item );
3423  }; // while
3424 
3425 }
3426 
3427 #endif
3428 
3429 
3430 //---------------------------------------------------------------------------
3431 // Stuff for per-thread fast random number generator
3432 // Table of primes
3433 
3434 static const unsigned __kmp_primes[] = {
3435  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3436  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3437  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3438  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3439  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3440  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3441  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3442  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3443  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3444  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3445  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3446  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3447  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3448  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3449  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3450  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3451 };
3452 
3453 //---------------------------------------------------------------------------
3454 // __kmp_get_random: Get a random number using a linear congruential method.
3455 
3456 unsigned short
3457 __kmp_get_random( kmp_info_t * thread )
3458 {
3459  unsigned x = thread->th.th_x;
3460  unsigned short r = x>>16;
3461 
3462  thread->th.th_x = x*thread->th.th_a+1;
3463 
3464  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3465  thread->th.th_info.ds.ds_tid, r) );
3466 
3467  return r;
3468 }
3469 //--------------------------------------------------------
3470 // __kmp_init_random: Initialize a random number generator
3471 
3472 void
3473 __kmp_init_random( kmp_info_t * thread )
3474 {
3475  unsigned seed = thread->th.th_info.ds.ds_tid;
3476 
3477  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3478  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3479  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3480 }
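
/*
 * Illustration only, not part of the library: the generator above is a per-thread
 * linear congruential sequence x(n+1) = a * x(n) + 1 (mod 2^32), where the multiplier
 * 'a' is chosen from the primes table by thread id and only the high 16 bits of the
 * state are handed out. The standalone sketch below reproduces that scheme; the short
 * primes table is just a subset of the table above, taken for illustration.
 */
#if 0
#include <stdio.h>

typedef struct fast_rng {
    unsigned x;       /* current LCG state     */
    unsigned a;       /* per-thread multiplier */
} fast_rng_t;

static void fast_rng_init( fast_rng_t *rng, unsigned tid )
{
    static const unsigned primes[] = { 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5 };
    rng->a = primes[ tid % ( sizeof(primes) / sizeof(primes[0]) ) ];
    rng->x = ( tid + 1 ) * rng->a + 1;            /* same seeding rule as __kmp_init_random */
}

static unsigned short fast_rng_next( fast_rng_t *rng )
{
    unsigned short r = (unsigned short)( rng->x >> 16 );   /* return the high bits  */
    rng->x = rng->x * rng->a + 1;                          /* advance the LCG state */
    return r;
}

int main( void )
{
    fast_rng_t rng;
    int i;
    fast_rng_init( &rng, 3 );
    for ( i = 0; i < 4; ++i )
        printf( "%u\n", (unsigned) fast_rng_next( &rng ) );
    return 0;
}
#endif
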
3481 
3482 
3483 #if KMP_OS_WINDOWS
3484 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3485 static int
3486 __kmp_reclaim_dead_roots(void) {
3487  int i, r = 0;
3488 
3489  for(i = 0; i < __kmp_threads_capacity; ++i) {
3490  if( KMP_UBER_GTID( i ) &&
3491  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3492  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3493  r += __kmp_unregister_root_other_thread(i);
3494  }
3495  }
3496  return r;
3497 }
3498 #endif
3499 
3500 /*
3501  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3502  free entries generated.
3503 
3504  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3505  already dead.
3506 
3507  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3508  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
3509  __kmp_tp_capacity, if threadprivate cache array has been created.
3510  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3511 
3512  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3513  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3514  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3515  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3516  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3517  as many free slots as possible up to nWish.
3518 
3519  If any argument is negative, the behavior is undefined.
3520 */
3521 static int
3522 __kmp_expand_threads(int nWish, int nNeed) {
3523  int added = 0;
3524  int old_tp_cached;
3525  int __kmp_actual_max_nth;
3526 
3527  if(nNeed > nWish) /* normalize the arguments */
3528  nWish = nNeed;
3529 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3530 /* only for Windows static library */
3531  /* reclaim array entries for root threads that are already dead */
3532  added = __kmp_reclaim_dead_roots();
3533 
3534  if(nNeed) {
3535  nNeed -= added;
3536  if(nNeed < 0)
3537  nNeed = 0;
3538  }
3539  if(nWish) {
3540  nWish -= added;
3541  if(nWish < 0)
3542  nWish = 0;
3543  }
3544 #endif
3545  if(nWish <= 0)
3546  return added;
3547 
3548  while(1) {
3549  int nTarget;
3550  int minimumRequiredCapacity;
3551  int newCapacity;
3552  kmp_info_t **newThreads;
3553  kmp_root_t **newRoot;
3554 
3555  //
3556  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3557  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3558  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3559  // become > __kmp_max_nth in one of two ways:
3560  //
3561  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3562  // may not be reused by another thread, so we may need to increase
3563  // __kmp_threads_capacity to __kmp_max_nth + 1.
3564  //
3565  // 2) New foreign root(s) are encountered. We always register new
3566  // foreign roots. This may cause a smaller # of threads to be
3567  // allocated at subsequent parallel regions, but the worker threads
3568  // hang around (and eventually go to sleep) and need slots in the
3569  // __kmp_threads[] array.
3570  //
3571  // Anyway, that is the reason for moving the check to see if
3572  // __kmp_max_threads was exceeded into __kmp_reseerve_threads()
3573  // instead of having it performed here. -BB
3574  //
3575  old_tp_cached = __kmp_tp_cached;
3576  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3577  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3578 
3579  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3580  nTarget = nWish;
3581  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3582  /* can't fulfil nWish, so try nNeed */
3583  if(nNeed) {
3584  nTarget = nNeed;
3585  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3586  /* possible expansion too small -- give up */
3587  break;
3588  }
3589  } else {
3590  /* best-effort */
3591  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3592  if(!nTarget) {
3593  /* cannot expand at all -- give up */
3594  break;
3595  }
3596  }
3597  }
3598  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3599 
3600  newCapacity = __kmp_threads_capacity;
3601  do{
3602  newCapacity =
3603  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3604  (newCapacity << 1) :
3605  __kmp_actual_max_nth;
3606  } while(newCapacity < minimumRequiredCapacity);
3607  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3608  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3609  KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3610  KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3611  memset(newThreads + __kmp_threads_capacity, 0,
3612  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3613  memset(newRoot + __kmp_threads_capacity, 0,
3614  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3615 
3616  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3617  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3618  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3619  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3620  of a double-check pair.
3621  */
3622  __kmp_free(newThreads);
3623  continue; /* start over and try again */
3624  }
3625  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3626  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3627  /* Same check as above, but this time with the lock held, so we can be sure whether we will succeed. */
3628  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3629  __kmp_free(newThreads);
3630  continue; /* start over and try again */
3631  } else {
3632  /* success */
3633  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3634  //
3635  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3636  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3637  added += newCapacity - __kmp_threads_capacity;
3638  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3639  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3640  break; /* succeeded, so we can exit the loop */
3641  }
3642  }
3643  return added;
3644 }
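
/*
 * Illustration only, not part of the library: the array growth above doubles the
 * current capacity until the required minimum is reached, clipping at the effective
 * maximum (__kmp_tp_capacity when a threadprivate cache exists, __kmp_sys_max_nth
 * otherwise). The standalone sketch below isolates that policy; current > 0 and
 * required_min <= clip are assumed, as the headroom check above guarantees.
 */
#if 0
static int compute_new_capacity( int current, int required_min, int clip )
{
    int new_capacity = current;
    do {
        /* double while there is headroom; otherwise jump straight to the clip value */
        new_capacity = new_capacity <= ( clip >> 1 ) ? ( new_capacity << 1 ) : clip;
    } while ( new_capacity < required_min );
    return new_capacity;
}

/* e.g. compute_new_capacity( 32, 40, 1024 ) == 64,
        compute_new_capacity( 32, 40,   48 ) == 48 (clipped at the maximum) */
#endif
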
3645 
3646 /* register the current thread as a root thread and obtain our gtid */
3647 /* we must have the __kmp_initz_lock held at this point */
3648 /* Argument TRUE only if we are the thread that calls from __kmp_do_serial_initialize() */
3649 int
3650 __kmp_register_root( int initial_thread )
3651 {
3652  kmp_info_t *root_thread;
3653  kmp_root_t *root;
3654  int gtid;
3655  int capacity;
3656  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3657  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3658  KMP_MB();
3659 
3660 
3661  /*
3662  2007-03-02:
3663 
3664  If the initial thread has not invoked the OpenMP RTL yet, and this thread is not an initial one,
3665  the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3666  return false (meaning there is at least one empty slot in the __kmp_threads array), but it
3667  is possible that the only free slot is #0, which is reserved for the initial thread and so cannot be
3668  used for this one. The following code works around this bug.
3669 
3670  However, the right solution seems to be not reserving slot #0 for the initial thread, because:
3671  (1) there is no magic in slot #0,
3672  (2) we cannot detect the initial thread reliably (the first thread that performs serial
3673  initialization may not be a real initial thread).
3674  */
3675  capacity = __kmp_threads_capacity;
3676  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3677  -- capacity;
3678  }; // if
3679 
3680  /* see if there are too many threads */
3681  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3682  if ( __kmp_tp_cached ) {
3683  __kmp_msg(
3684  kmp_ms_fatal,
3685  KMP_MSG( CantRegisterNewThread ),
3686  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3687  KMP_HNT( PossibleSystemLimitOnThreads ),
3688  __kmp_msg_null
3689  );
3690  }
3691  else {
3692  __kmp_msg(
3693  kmp_ms_fatal,
3694  KMP_MSG( CantRegisterNewThread ),
3695  KMP_HNT( SystemLimitOnThreads ),
3696  __kmp_msg_null
3697  );
3698  }
3699  }; // if
3700 
3701  /* find an available thread slot */
3702  /* Don't reassign the zero slot since we need that to only be used by initial
3703  thread */
3704  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3705  ;
3706  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3707  KMP_ASSERT( gtid < __kmp_threads_capacity );
3708 
3709  /* update global accounting */
3710  __kmp_all_nth ++;
3711  TCW_4(__kmp_nth, __kmp_nth + 1);
3712 
3713  //
3714  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3715  // for low numbers of procs, and method #2 (keyed API call) for higher
3716  // numbers of procs.
3717  //
3718  if ( __kmp_adjust_gtid_mode ) {
3719  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3720  if ( TCR_4(__kmp_gtid_mode) != 2) {
3721  TCW_4(__kmp_gtid_mode, 2);
3722  }
3723  }
3724  else {
3725  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3726  TCW_4(__kmp_gtid_mode, 1);
3727  }
3728  }
3729  }
3730 
3731 #ifdef KMP_ADJUST_BLOCKTIME
3732  /* Adjust blocktime to zero if necessary */
3733  /* Middle initialization might not have occurred yet */
3734  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3735  if ( __kmp_nth > __kmp_avail_proc ) {
3736  __kmp_zero_bt = TRUE;
3737  }
3738  }
3739 #endif /* KMP_ADJUST_BLOCKTIME */
3740 
3741  /* setup this new hierarchy */
3742  if( ! ( root = __kmp_root[gtid] )) {
3743  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3744  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3745  }
3746 
3747  __kmp_initialize_root( root );
3748 
3749  /* setup new root thread structure */
3750  if( root->r.r_uber_thread ) {
3751  root_thread = root->r.r_uber_thread;
3752  } else {
3753  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3754  if ( __kmp_storage_map ) {
3755  __kmp_print_thread_storage_map( root_thread, gtid );
3756  }
3757  root_thread->th.th_info .ds.ds_gtid = gtid;
3758  root_thread->th.th_root = root;
3759  if( __kmp_env_consistency_check ) {
3760  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3761  }
3762  #if USE_FAST_MEMORY
3763  __kmp_initialize_fast_memory( root_thread );
3764  #endif /* USE_FAST_MEMORY */
3765 
3766  #if KMP_USE_BGET
3767  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3768  __kmp_initialize_bget( root_thread );
3769  #endif
3770  __kmp_init_random( root_thread ); // Initialize random number generator
3771  }
3772 
3773  /* setup the serial team held in reserve by the root thread */
3774  if( ! root_thread->th.th_serial_team ) {
3775  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3776  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3777 
3778  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3779 #if OMPT_SUPPORT
3780  0, // root parallel id
3781 #endif
3782 #if OMP_40_ENABLED
3783  proc_bind_default,
3784 #endif
3785  &r_icvs,
3786  0 USE_NESTED_HOT_ARG(NULL) );
3787  }
3788  KMP_ASSERT( root_thread->th.th_serial_team );
3789  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3790  root_thread->th.th_serial_team ) );
3791 
3792  /* drop root_thread into place */
3793  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3794 
3795  root->r.r_root_team->t.t_threads[0] = root_thread;
3796  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3797  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3798  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3799  root->r.r_uber_thread = root_thread;
3800 
3801  /* initialize the thread, get it ready to go */
3802  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3803 
3804  /* prepare the master thread for get_gtid() */
3805  __kmp_gtid_set_specific( gtid );
3806 
3807  __kmp_itt_thread_name( gtid );
3808 
3809  #ifdef KMP_TDATA_GTID
3810  __kmp_gtid = gtid;
3811  #endif
3812  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3813  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3814  TCW_4(__kmp_init_gtid, TRUE);
3815 
3816  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3817  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3818  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3819  KMP_INIT_BARRIER_STATE ) );
3820  { // Initialize barrier data.
3821  int b;
3822  for ( b = 0; b < bs_last_barrier; ++ b ) {
3823  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3824 #if USE_DEBUGGER
3825  root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3826 #endif
3827  }; // for
3828  }
3829  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3830 
3831 #if KMP_AFFINITY_SUPPORTED
3832  if ( TCR_4(__kmp_init_middle) ) {
3833  __kmp_affinity_set_init_mask( gtid, TRUE );
3834  }
3835 #endif /* KMP_AFFINITY_SUPPORTED */
3836 
3837  __kmp_root_counter ++;
3838 
3839  KMP_MB();
3840  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3841 
3842  return gtid;
3843 }
3844 
3845 #if KMP_NESTED_HOT_TEAMS
3846 static int
3847 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3848 {
3849  int i, n, nth;
3850  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3851  if( !hot_teams || !hot_teams[level].hot_team ) {
3852  return 0;
3853  }
3854  KMP_DEBUG_ASSERT( level < max_level );
3855  kmp_team_t *team = hot_teams[level].hot_team;
3856  nth = hot_teams[level].hot_team_nth;
3857  n = nth - 1; // master is not freed
3858  if( level < max_level - 1 ) {
3859  for( i = 0; i < nth; ++i ) {
3860  kmp_info_t *th = team->t.t_threads[i];
3861  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3862  if( i > 0 && th->th.th_hot_teams ) {
3863  __kmp_free( th->th.th_hot_teams );
3864  th->th.th_hot_teams = NULL;
3865  }
3866  }
3867  }
3868  __kmp_free_team( root, team, NULL );
3869  return n;
3870 }
3871 #endif
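
/*
 * Illustration only, not part of the library: __kmp_free_hot_teams above walks the
 * nested hot-team hierarchy recursively, freeing each level and counting the released
 * workers while never counting masters. The standalone sketch below shows the same
 * traversal on a hypothetical "toy_team" tree; all types and names here are made up.
 */
#if 0
#include <stdlib.h>

typedef struct toy_team toy_team_t;
typedef struct toy_thread {
    toy_team_t * nested;    /* team owned at the next nesting level, or NULL      */
} toy_thread_t;
struct toy_team {
    int            nth;     /* number of threads, including the master (index 0)  */
    toy_thread_t * threads; /* heap-allocated array of nth entries                */
};

static int toy_free_nested( toy_team_t * team, int level, int max_level )
{
    int i, n = team->nth - 1;                 /* count workers only, not the master */
    if ( level < max_level - 1 ) {
        for ( i = 0; i < team->nth; ++i ) {
            if ( team->threads[i].nested != NULL ) {
                n += toy_free_nested( team->threads[i].nested, level + 1, max_level );
                team->threads[i].nested = NULL;
            }
        }
    }
    free( team->threads );
    free( team );
    return n;
}
#endif
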
3872 
3873 /* Resets a root thread and clears its root and hot teams.
3874  Returns the number of __kmp_threads entries directly and indirectly freed.
3875 */
3876 static int
3877 __kmp_reset_root(int gtid, kmp_root_t *root)
3878 {
3879  kmp_team_t * root_team = root->r.r_root_team;
3880  kmp_team_t * hot_team = root->r.r_hot_team;
3881  int n = hot_team->t.t_nproc;
3882  int i;
3883 
3884  KMP_DEBUG_ASSERT( ! root->r.r_active );
3885 
3886  root->r.r_root_team = NULL;
3887  root->r.r_hot_team = NULL;
3888  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
3889  // to __kmp_free_team().
3890  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3891 #if KMP_NESTED_HOT_TEAMS
3892  if( __kmp_hot_teams_max_level > 1 ) { // need to free nested hot teams and their threads if any
3893  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3894  kmp_info_t *th = hot_team->t.t_threads[i];
3895  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3896  if( th->th.th_hot_teams ) {
3897  __kmp_free( th->th.th_hot_teams );
3898  th->th.th_hot_teams = NULL;
3899  }
3900  }
3901  }
3902 #endif
3903  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3904 
3905  //
3906  // Before we can reap the thread, we need to make certain that all
3907  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3908  //
3909  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3910  __kmp_wait_to_unref_task_teams();
3911  }
3912 
3913  #if KMP_OS_WINDOWS
3914  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3915  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3916  (LPVOID)&(root->r.r_uber_thread->th),
3917  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3918  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3919  #endif /* KMP_OS_WINDOWS */
3920 
3921 #if OMPT_SUPPORT
3922  if (ompt_enabled &&
3923  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3924  int gtid = __kmp_get_gtid();
3925  __ompt_thread_end(ompt_thread_initial, gtid);
3926  }
3927 #endif
3928 
3929  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3930  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3931 
3932  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3933  root->r.r_uber_thread = NULL;
3934  /* mark root as no longer in use */
3935  root->r.r_begin = FALSE;
3936 
3937  return n;
3938 }
3939 
3940 void
3941 __kmp_unregister_root_current_thread( int gtid )
3942 {
3943  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3944  /* this lock should be ok, since unregister_root_current_thread is never called during
3945  * an abort, only during a normal close. furthermore, if you have the
3946  * forkjoin lock, you should never try to get the initz lock */
3947 
3948  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3949  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3950  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3951  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3952  return;
3953  }
3954  kmp_root_t *root = __kmp_root[gtid];
3955 
3956  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3957  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3958  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3959  KMP_ASSERT( root->r.r_active == FALSE );
3960 
3961 
3962  KMP_MB();
3963 
3964 #if OMP_41_ENABLED
3965  kmp_info_t * thread = __kmp_threads[gtid];
3966  kmp_team_t * team = thread->th.th_team;
3967  kmp_task_team_t * task_team = thread->th.th_task_team;
3968 
3969  // we need to wait for the proxy tasks before finishing the thread
3970  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3971 #if OMPT_SUPPORT
3972  // the runtime is shutting down so we won't report any events
3973  thread->th.ompt_thread_info.state = ompt_state_undefined;
3974 #endif
3975  __kmp_task_team_wait(thread, team, NULL );
3976  }
3977 #endif
3978 
3979  __kmp_reset_root(gtid, root);
3980 
3981  /* free up this thread slot */
3982  __kmp_gtid_set_specific( KMP_GTID_DNE );
3983 #ifdef KMP_TDATA_GTID
3984  __kmp_gtid = KMP_GTID_DNE;
3985 #endif
3986 
3987  KMP_MB();
3988  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3989 
3990  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3991 }
3992 
3993 #if KMP_OS_WINDOWS
3994 /* __kmp_forkjoin_lock must be already held
3995  Unregisters a root thread that is not the current thread. Returns the number of
3996  __kmp_threads entries freed as a result.
3997  */
3998 static int
3999 __kmp_unregister_root_other_thread( int gtid )
4000 {
4001  kmp_root_t *root = __kmp_root[gtid];
4002  int r;
4003 
4004  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
4005  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
4006  KMP_ASSERT( KMP_UBER_GTID( gtid ));
4007  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
4008  KMP_ASSERT( root->r.r_active == FALSE );
4009 
4010  r = __kmp_reset_root(gtid, root);
4011  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
4012  return r;
4013 }
4014 #endif
4015 
4016 #if KMP_DEBUG
4017 void __kmp_task_info() {
4018 
4019  kmp_int32 gtid = __kmp_entry_gtid();
4020  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
4021  kmp_info_t *this_thr = __kmp_threads[ gtid ];
4022  kmp_team_t *steam = this_thr->th.th_serial_team;
4023  kmp_team_t *team = this_thr->th.th_team;
4024 
4025  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
4026  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
4027 }
4028 #endif // KMP_DEBUG
4029 
4030 /* TODO optimize with one big memclr, take out what isn't needed,
4031  * split responsibility to workers as much as possible, and delay
4032  * initialization of features as much as possible */
4033 static void
4034 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
4035 {
4036  /* this_thr->th.th_info.ds.ds_gtid is set up in __kmp_allocate_thread/create_worker
4037  * this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4038  kmp_info_t *master = team->t.t_threads[0];
4039  KMP_DEBUG_ASSERT( this_thr != NULL );
4040  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
4041  KMP_DEBUG_ASSERT( team );
4042  KMP_DEBUG_ASSERT( team->t.t_threads );
4043  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4044  KMP_DEBUG_ASSERT( master );
4045  KMP_DEBUG_ASSERT( master->th.th_root );
4046 
4047  KMP_MB();
4048 
4049  TCW_SYNC_PTR(this_thr->th.th_team, team);
4050 
4051  this_thr->th.th_info.ds.ds_tid = tid;
4052  this_thr->th.th_set_nproc = 0;
4053 #if OMP_40_ENABLED
4054  this_thr->th.th_set_proc_bind = proc_bind_default;
4055 # if KMP_AFFINITY_SUPPORTED
4056  this_thr->th.th_new_place = this_thr->th.th_current_place;
4057 # endif
4058 #endif
4059  this_thr->th.th_root = master->th.th_root;
4060 
4061  /* setup the thread's cache of the team structure */
4062  this_thr->th.th_team_nproc = team->t.t_nproc;
4063  this_thr->th.th_team_master = master;
4064  this_thr->th.th_team_serialized = team->t.t_serialized;
4065  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4066 
4067  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4068 
4069  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4070  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4071 
4072  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4073 
4074  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4075  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4076  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4077 
4078  /* TODO no worksharing in speculative threads */
4079  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
4080 
4081  this_thr->th.th_local.this_construct = 0;
4082 
4083 #ifdef BUILD_TV
4084  this_thr->th.th_local.tv_data = 0;
4085 #endif
4086 
4087  if ( ! this_thr->th.th_pri_common ) {
4088  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4089  if ( __kmp_storage_map ) {
4090  __kmp_print_storage_map_gtid(
4091  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4092  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4093  );
4094  }; // if
4095  this_thr->th.th_pri_head = NULL;
4096  }; // if
4097 
4098  /* Initialize dynamic dispatch */
4099  {
4100  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4101  /*
4102  * Use team max_nproc since this will never change for the team.
4103  */
4104  size_t disp_size = sizeof( dispatch_private_info_t ) *
4105  ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
4106  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4107  KMP_ASSERT( dispatch );
4108  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4109  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4110 
4111  dispatch->th_disp_index = 0;
4112 
4113  if( ! dispatch->th_disp_buffer ) {
4114  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4115 
4116  if ( __kmp_storage_map ) {
4117  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4118  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
4119  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4120  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4121  gtid, team->t.t_id, gtid );
4122  }
4123  } else {
4124  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4125  }
4126 
4127  dispatch->th_dispatch_pr_current = 0;
4128  dispatch->th_dispatch_sh_current = 0;
4129 
4130  dispatch->th_deo_fcn = 0; /* ORDERED */
4131  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4132  }
4133 
4134  this_thr->th.th_next_pool = NULL;
4135 
4136  if (!this_thr->th.th_task_state_memo_stack) {
4137  size_t i;
4138  this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4139  this_thr->th.th_task_state_top = 0;
4140  this_thr->th.th_task_state_stack_sz = 4;
4141  for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4142  this_thr->th.th_task_state_memo_stack[i] = 0;
4143  }
4144 
4145  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4146  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4147 
4148  KMP_MB();
4149 }
4150 
4151 
4152 /* allocate a new thread for the requesting team. this is only called from within a
4153  * forkjoin critical section. we will first try to get an available thread from the
4154  * thread pool. if none is available, we will fork a new one assuming we are able
4155  * to create a new one. this should be assured, as the caller should check on this
4156  * first.
4157  */
4158 kmp_info_t *
4159 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4160 {
4161  kmp_team_t *serial_team;
4162  kmp_info_t *new_thr;
4163  int new_gtid;
4164 
4165  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4166  KMP_DEBUG_ASSERT( root && team );
4167 #if !KMP_NESTED_HOT_TEAMS
4168  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4169 #endif
4170  KMP_MB();
4171 
4172  /* first, try to get one from the thread pool */
4173  if ( __kmp_thread_pool ) {
4174 
4175  new_thr = (kmp_info_t*)__kmp_thread_pool;
4176  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4177  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4178  __kmp_thread_pool_insert_pt = NULL;
4179  }
4180  TCW_4(new_thr->th.th_in_pool, FALSE);
4181  //
4182  // Don't touch th_active_in_pool or th_active.
4183  // The worker thread adjusts those flags as it sleeps/awakens.
4184  //
4185 
4186  __kmp_thread_pool_nth--;
4187 
4188  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4189  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4190  KMP_ASSERT( ! new_thr->th.th_team );
4191  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4192  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4193 
4194  /* setup the thread structure */
4195  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4196  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4197 
4198  TCW_4(__kmp_nth, __kmp_nth + 1);
4199 
4200  new_thr->th.th_task_state = 0;
4201  new_thr->th.th_task_state_top = 0;
4202  new_thr->th.th_task_state_stack_sz = 4;
4203 
4204 #ifdef KMP_ADJUST_BLOCKTIME
4205  /* Adjust blocktime back to zero if necessary */
4206  /* Middle initialization might not have occurred yet */
4207  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4208  if ( __kmp_nth > __kmp_avail_proc ) {
4209  __kmp_zero_bt = TRUE;
4210  }
4211  }
4212 #endif /* KMP_ADJUST_BLOCKTIME */
4213 
4214 #if KMP_DEBUG
4215  // If thread entered pool via __kmp_free_thread, wait_flag should != KMP_BARRIER_PARENT_FLAG.
4216  int b;
4217  kmp_balign_t * balign = new_thr->th.th_bar;
4218  for( b = 0; b < bs_last_barrier; ++ b )
4219  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4220 #endif
4221 
4222  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4223  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4224 
4225  KMP_MB();
4226  return new_thr;
4227  }
4228 
4229 
4230  /* no, we'll fork a new one */
4231  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
4232  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4233 
4234  //
4235  // If this is the first worker thread the RTL is creating, then also
4236  // launch the monitor thread. We try to do this as early as possible.
4237  //
4238  if ( ! TCR_4( __kmp_init_monitor ) ) {
4239  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4240  if ( ! TCR_4( __kmp_init_monitor ) ) {
4241  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4242  TCW_4( __kmp_init_monitor, 1 );
4243  __kmp_create_monitor( & __kmp_monitor );
4244  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4245  #if KMP_OS_WINDOWS
4246  // AC: wait until monitor has started. This is a fix for CQ232808.
4247  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
4248  // work in between, then there is a high probability that the monitor thread starts after
4249  // the library shutdown. At shutdown it is too late to cope with the problem, because
4250  // when the master is in DllMain (process detach) the monitor has no chance to start
4251  // (it is blocked), and the master has no means to inform the monitor that the library has gone,
4252  // because all the memory which the monitor can access is going to be released/reset.
4253  while ( TCR_4(__kmp_init_monitor) < 2 ) {
4254  KMP_YIELD( TRUE );
4255  }
4256  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4257  #endif
4258  }
4259  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4260  }
4261 
4262  KMP_MB();
4263  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4264  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4265  }
4266 
4267  /* allocate space for it. */
4268  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4269 
4270  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4271 
4272  if ( __kmp_storage_map ) {
4273  __kmp_print_thread_storage_map( new_thr, new_gtid );
4274  }
4275 
4276  /* add the reserve serialized team, initialized from the team's master thread */
4277  {
4278  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4279  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4280 
4281  new_thr->th.th_serial_team = serial_team =
4282  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4283 #if OMPT_SUPPORT
4284  0, // root parallel id
4285 #endif
4286 #if OMP_40_ENABLED
4287  proc_bind_default,
4288 #endif
4289  &r_icvs,
4290  0 USE_NESTED_HOT_ARG(NULL) );
4291  }
4292  KMP_ASSERT ( serial_team );
4293  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
4294  serial_team->t.t_threads[0] = new_thr;
4295  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4296  new_thr ) );
4297 
4298  /* setup the thread structures */
4299  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4300 
4301  #if USE_FAST_MEMORY
4302  __kmp_initialize_fast_memory( new_thr );
4303  #endif /* USE_FAST_MEMORY */
4304 
4305  #if KMP_USE_BGET
4306  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4307  __kmp_initialize_bget( new_thr );
4308  #endif
4309 
4310  __kmp_init_random( new_thr ); // Initialize random number generator
4311 
4312  /* Initialize these only once when thread is grabbed for a team allocation */
4313  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4314  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4315 
4316  int b;
4317  kmp_balign_t * balign = new_thr->th.th_bar;
4318  for(b=0; b<bs_last_barrier; ++b) {
4319  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4320  balign[b].bb.team = NULL;
4321  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4322  balign[b].bb.use_oncore_barrier = 0;
4323  }
4324 
4325  new_thr->th.th_spin_here = FALSE;
4326  new_thr->th.th_next_waiting = 0;
4327 
4328 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4329  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4330  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4331  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4332  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4333 #endif
4334 
4335  TCW_4(new_thr->th.th_in_pool, FALSE);
4336  new_thr->th.th_active_in_pool = FALSE;
4337  TCW_4(new_thr->th.th_active, TRUE);
4338 
4339  /* adjust the global counters */
4340  __kmp_all_nth ++;
4341  __kmp_nth ++;
4342 
4343  //
4344  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4345  // for low numbers of procs, and method #2 (keyed API call) for higher
4346  // numbers of procs.
4347  //
4348  if ( __kmp_adjust_gtid_mode ) {
4349  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4350  if ( TCR_4(__kmp_gtid_mode) != 2) {
4351  TCW_4(__kmp_gtid_mode, 2);
4352  }
4353  }
4354  else {
4355  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4356  TCW_4(__kmp_gtid_mode, 1);
4357  }
4358  }
4359  }
4360 
4361 #ifdef KMP_ADJUST_BLOCKTIME
4362  /* Adjust blocktime back to zero if necessary */
4363  /* Middle initialization might not have occurred yet */
4364  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4365  if ( __kmp_nth > __kmp_avail_proc ) {
4366  __kmp_zero_bt = TRUE;
4367  }
4368  }
4369 #endif /* KMP_ADJUST_BLOCKTIME */
4370 
4371  /* actually fork it and create the new worker thread */
4372  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4373  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4374  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4375 
4376 
4377  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4378  KMP_MB();
4379  return new_thr;
4380 }
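
/*
 * Illustration only, not part of the library: the allocation policy above is
 * "pool first, fork second" -- a singly linked free list serves as the thread pool,
 * and a brand new worker is created only when the pool is empty. The standalone
 * sketch below shows that pattern with a hypothetical worker_t type; the real code
 * runs under the forkjoin lock, so no extra locking is shown here.
 */
#if 0
#include <stdlib.h>

typedef struct worker {
    struct worker * next_pool;   /* link used only while the worker sits in the pool */
    int             id;
} worker_t;

static worker_t * pool_head = NULL;   /* head of the free list     */
static int        next_id   = 0;      /* id source for new workers */

static worker_t * allocate_worker( void )
{
    if ( pool_head != NULL ) {
        worker_t * w = pool_head;     /* reuse: pop the head of the pool */
        pool_head = w->next_pool;
        w->next_pool = NULL;
        return w;
    }
    {
        worker_t * w = (worker_t *) malloc( sizeof( worker_t ) );  /* pool empty: create one */
        if ( w == NULL )
            return NULL;
        w->next_pool = NULL;
        w->id = next_id++;
        return w;
    }
}

static void release_worker( worker_t * w )
{
    w->next_pool = pool_head;         /* push back onto the pool for later reuse */
    pool_head = w;
}
#endif
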
4381 
4382 /*
4383  * reinitialize team for reuse.
4384  *
4385  * The hot team code calls this routine at every fork barrier, so the EPCC barrier
4386  * tests are extremely sensitive to changes in it, esp. writes to the team
4387  * struct, which cause a cache invalidation in all threads.
4388  *
4389  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4390  */
4391 static void
4392 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4393  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4394  team->t.t_threads[0], team ) );
4395  KMP_DEBUG_ASSERT( team && new_icvs);
4396  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4397  team->t.t_ident = loc;
4398 
4399  team->t.t_id = KMP_GEN_TEAM_ID();
4400 
4401  // Copy ICVs to the master thread's implicit taskdata
4402  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4403  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4404 
4405  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4406  team->t.t_threads[0], team ) );
4407 }
4408 
4409 
4410 /* initialize the team data structure
4411  * this assumes the t_threads and t_max_nproc are already set
4412  * also, we don't touch the arguments */
4413 static void
4414 __kmp_initialize_team(
4415  kmp_team_t * team,
4416  int new_nproc,
4417  kmp_internal_control_t * new_icvs,
4418  ident_t * loc
4419 ) {
4420  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4421 
4422  /* verify */
4423  KMP_DEBUG_ASSERT( team );
4424  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4425  KMP_DEBUG_ASSERT( team->t.t_threads );
4426  KMP_MB();
4427 
4428  team->t.t_master_tid = 0; /* not needed */
4429  /* team->t.t_master_bar; not needed */
4430  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4431  team->t.t_nproc = new_nproc;
4432 
4433  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4434  team->t.t_next_pool = NULL;
4435  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4436 
4437  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4438  team->t.t_invoke = NULL; /* not needed */
4439 
4440  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4441  team->t.t_sched = new_icvs->sched;
4442 
4443 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4444  team->t.t_fp_control_saved = FALSE; /* not needed */
4445  team->t.t_x87_fpu_control_word = 0; /* not needed */
4446  team->t.t_mxcsr = 0; /* not needed */
4447 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4448 
4449  team->t.t_construct = 0;
4450  __kmp_init_lock( & team->t.t_single_lock );
4451 
4452  team->t.t_ordered .dt.t_value = 0;
4453  team->t.t_master_active = FALSE;
4454 
4455  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4456 
4457 #ifdef KMP_DEBUG
4458  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4459 #endif
4460  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4461 
4462  team->t.t_control_stack_top = NULL;
4463 
4464  __kmp_reinitialize_team( team, new_icvs, loc );
4465 
4466  KMP_MB();
4467  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4468 }
4469 
4470 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4471 /* Sets full mask for thread and returns old mask, no changes to structures. */
4472 static void
4473 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4474 {
4475  if ( KMP_AFFINITY_CAPABLE() ) {
4476  int status;
4477  if ( old_mask != NULL ) {
4478  status = __kmp_get_system_affinity( old_mask, TRUE );
4479  int error = errno;
4480  if ( status != 0 ) {
4481  __kmp_msg(
4482  kmp_ms_fatal,
4483  KMP_MSG( ChangeThreadAffMaskError ),
4484  KMP_ERR( error ),
4485  __kmp_msg_null
4486  );
4487  }
4488  }
4489  __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
4490  }
4491 }
4492 #endif
4493 
4494 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4495 
4496 //
4497 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4498 // It calculates the worker + master thread's partition based upon the parent
4499 // thread's partition, and binds each worker to a place in their partition.
4500 // The master thread's partition should already include its current binding.
4501 //
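
/*
 * Illustration only, not part of the library: when proc_bind_close has more threads
 * than places, the loop further below assigns a base block of S = n_th / n_places
 * threads to each place and spreads the remaining rem threads one extra at a time,
 * every "gap" places. The standalone sketch below reproduces that distribution with
 * places numbered 0 .. n_places-1 (the real code wraps within [first_place,last_place]).
 */
#if 0
#include <stdio.h>

static void sketch_close_distribution( int n_th, int n_places )
{
    int S   = n_th / n_places;                       /* base threads per place     */
    int rem = n_th - S * n_places;                   /* places receiving one extra */
    int gap = rem > 0 ? n_places / rem : n_places;   /* spacing between the extras */
    int place = 0, s_count = 0, gap_ct = gap, f;

    for ( f = 0; f < n_th; f++ ) {
        printf( "thread %d -> place %d\n", f, place );
        s_count++;
        if ( s_count == S && rem && gap_ct == gap ) {
            /* keep this place open for one extra thread on the next iteration */
        } else if ( s_count == S + 1 && rem && gap_ct == gap ) {
            place = ( place + 1 ) % n_places;        /* extra placed; move on      */
            s_count = 0;  gap_ct = 1;  rem--;
        } else if ( s_count == S ) {
            place = ( place + 1 ) % n_places;        /* place is full; move on     */
            s_count = 0;  gap_ct++;
        }
    }
}
#endif
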
4502 static void
4503 __kmp_partition_places( kmp_team_t *team )
4504 {
4505  //
4506  // Copy the master thread's place partition to the team struct
4507  //
4508  kmp_info_t *master_th = team->t.t_threads[0];
4509  KMP_DEBUG_ASSERT( master_th != NULL );
4510  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4511  int first_place = master_th->th.th_first_place;
4512  int last_place = master_th->th.th_last_place;
4513  int masters_place = master_th->th.th_current_place;
4514  team->t.t_first_place = first_place;
4515  team->t.t_last_place = last_place;
4516 
4517  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4518  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4519  masters_place, first_place, last_place ) );
4520 
4521  switch ( proc_bind ) {
4522 
4523  case proc_bind_default:
4524  //
4525  // serial teams might have the proc_bind policy set to
4526  // proc_bind_default. It doesn't matter, as we don't
4527  // rebind the master thread for any proc_bind policy.
4528  //
4529  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4530  break;
4531 
4532  case proc_bind_master:
4533  {
4534  int f;
4535  int n_th = team->t.t_nproc;
4536  for ( f = 1; f < n_th; f++ ) {
4537  kmp_info_t *th = team->t.t_threads[f];
4538  KMP_DEBUG_ASSERT( th != NULL );
4539  th->th.th_first_place = first_place;
4540  th->th.th_last_place = last_place;
4541  th->th.th_new_place = masters_place;
4542 
4543  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4544  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4545  team->t.t_id, f, masters_place, first_place, last_place ) );
4546  }
4547  }
4548  break;
4549 
4550  case proc_bind_close:
4551  {
4552  int f;
4553  int n_th = team->t.t_nproc;
4554  int n_places;
4555  if ( first_place <= last_place ) {
4556  n_places = last_place - first_place + 1;
4557  }
4558  else {
4559  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4560  }
4561  if ( n_th <= n_places ) {
4562  int place = masters_place;
4563  for ( f = 1; f < n_th; f++ ) {
4564  kmp_info_t *th = team->t.t_threads[f];
4565  KMP_DEBUG_ASSERT( th != NULL );
4566 
4567  if ( place == last_place ) {
4568  place = first_place;
4569  }
4570  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4571  place = 0;
4572  }
4573  else {
4574  place++;
4575  }
4576  th->th.th_first_place = first_place;
4577  th->th.th_last_place = last_place;
4578  th->th.th_new_place = place;
4579 
4580  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4581  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4582  team->t.t_id, f, place, first_place, last_place ) );
4583  }
4584  }
4585  else {
4586  int S, rem, gap, s_count;
4587  S = n_th / n_places;
4588  s_count = 0;
4589  rem = n_th - ( S * n_places );
4590  gap = rem > 0 ? n_places/rem : n_places;
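 // Illustrative example: n_th = 7 threads over n_places = 3 places gives
 // S = 2, rem = 1, gap = 3, so the places receive 3, 2 and 2 threads; the
 // master's place absorbs the extra thread, and the place cursor ends up back
 // at masters_place, which the assertion after the loop checks.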
4591  int place = masters_place;
4592  int gap_ct = gap;
4593  for ( f = 0; f < n_th; f++ ) {
4594  kmp_info_t *th = team->t.t_threads[f];
4595  KMP_DEBUG_ASSERT( th != NULL );
4596 
4597  th->th.th_first_place = first_place;
4598  th->th.th_last_place = last_place;
4599  th->th.th_new_place = place;
4600  s_count++;
4601 
4602  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4603  // do nothing, add an extra thread to place on next iteration
4604  }
4605  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4606  // we added an extra thread to this place; move to next place
4607  if ( place == last_place ) {
4608  place = first_place;
4609  }
4610  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4611  place = 0;
4612  }
4613  else {
4614  place++;
4615  }
4616  s_count = 0;
4617  gap_ct = 1;
4618  rem--;
4619  }
4620  else if (s_count == S) { // place full; don't add extra
4621  if ( place == last_place ) {
4622  place = first_place;
4623  }
4624  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4625  place = 0;
4626  }
4627  else {
4628  place++;
4629  }
4630  gap_ct++;
4631  s_count = 0;
4632  }
4633 
4634  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4635  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4636  team->t.t_id, f, th->th.th_new_place, first_place,
4637  last_place ) );
4638  }
4639  KMP_DEBUG_ASSERT( place == masters_place );
4640  }
4641  }
4642  break;
4643 
4644  case proc_bind_spread:
4645  {
4646  int f;
4647  int n_th = team->t.t_nproc;
4648  int n_places;
4649  if ( first_place <= last_place ) {
4650  n_places = last_place - first_place + 1;
4651  }
4652  else {
4653  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4654  }
4655  if ( n_th <= n_places ) {
4656  int place = masters_place;
4657  int S = n_places/n_th;
4658  int s_count, rem, gap, gap_ct;
4659  rem = n_places - n_th*S;
4660  gap = rem ? n_th/rem : 1;
4661  gap_ct = gap;
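 // Illustrative example: n_places = 8, n_th = 3 gives S = 2, rem = 2, gap = 1,
 // so with the master on place 0 of partition [0,7] the threads receive the
 // sub-partitions [0,2], [3,5] and [6,7], and each thread is bound to the
 // first place of its sub-partition.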
4662  for ( f = 0; f < n_th; f++ ) {
4663  kmp_info_t *th = team->t.t_threads[f];
4664  KMP_DEBUG_ASSERT( th != NULL );
4665 
4666  th->th.th_first_place = place;
4667  th->th.th_new_place = place;
4668  s_count = 1;
4669  while (s_count < S) {
4670  if ( place == last_place ) {
4671  place = first_place;
4672  }
4673  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4674  place = 0;
4675  }
4676  else {
4677  place++;
4678  }
4679  s_count++;
4680  }
4681  if (rem && (gap_ct == gap)) {
4682  if ( place == last_place ) {
4683  place = first_place;
4684  }
4685  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4686  place = 0;
4687  }
4688  else {
4689  place++;
4690  }
4691  rem--;
4692  gap_ct = 0;
4693  }
4694  th->th.th_last_place = place;
4695  gap_ct++;
4696 
4697  if ( place == last_place ) {
4698  place = first_place;
4699  }
4700  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4701  place = 0;
4702  }
4703  else {
4704  place++;
4705  }
4706 
4707  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4708  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4709  team->t.t_id, f, th->th.th_new_place,
4710  th->th.th_first_place, th->th.th_last_place ) );
4711  }
4712  KMP_DEBUG_ASSERT( place == masters_place );
4713  }
4714  else {
4715  int S, rem, gap, s_count;
4716  S = n_th / n_places;
4717  s_count = 0;
4718  rem = n_th - ( S * n_places );
4719  gap = rem > 0 ? n_places/rem : n_places;
4720  int place = masters_place;
4721  int gap_ct = gap;
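 // Same distribution scheme as the oversubscribed proc_bind_close case above,
 // except that each thread's partition collapses to the single place it is
 // assigned, so teams forked by these threads inherit a one-place partition.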
4722  for ( f = 0; f < n_th; f++ ) {
4723  kmp_info_t *th = team->t.t_threads[f];
4724  KMP_DEBUG_ASSERT( th != NULL );
4725 
4726  th->th.th_first_place = place;
4727  th->th.th_last_place = place;
4728  th->th.th_new_place = place;
4729  s_count++;
4730 
4731  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4732  // do nothing, add an extra thread to place on next iteration
4733  }
4734  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4735  // we added an extra thread to this place; move on to next place
4736  if ( place == last_place ) {
4737  place = first_place;
4738  }
4739  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4740  place = 0;
4741  }
4742  else {
4743  place++;
4744  }
4745  s_count = 0;
4746  gap_ct = 1;
4747  rem--;
4748  }
4749  else if (s_count == S) { // place is full; don't add extra thread
4750  if ( place == last_place ) {
4751  place = first_place;
4752  }
4753  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4754  place = 0;
4755  }
4756  else {
4757  place++;
4758  }
4759  gap_ct++;
4760  s_count = 0;
4761  }
4762 
4763  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4764  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4765  team->t.t_id, f, th->th.th_new_place,
4766  th->th.th_first_place, th->th.th_last_place) );
4767  }
4768  KMP_DEBUG_ASSERT( place == masters_place );
4769  }
4770  }
4771  break;
4772 
4773  default:
4774  break;
4775  }
4776 
4777  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4778 }
4779 
4780 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4781 
4782 /* Allocate a new team data structure to use; take one from the free pool if available. */
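/* Three paths are tried in order: (1) reuse the "hot" team (root's or nested),
   resizing it if the requested thread count changed; (2) take a team from
   __kmp_team_pool if one with a large enough t_max_nproc is found; (3) allocate
   and initialize a brand-new team. */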
4783 kmp_team_t *
4784 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4785 #if OMPT_SUPPORT
4786  ompt_parallel_id_t ompt_parallel_id,
4787 #endif
4788 #if OMP_40_ENABLED
4789  kmp_proc_bind_t new_proc_bind,
4790 #endif
4791  kmp_internal_control_t *new_icvs,
4792  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4793 {
4794  KMP_TIME_DEVELOPER_BLOCK(KMP_allocate_team);
4795  int f;
4796  kmp_team_t *team;
4797  int use_hot_team = ! root->r.r_active;
4798  int level = 0;
4799 
4800  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4801  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4802  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4803  KMP_MB();
4804 
4805 #if KMP_NESTED_HOT_TEAMS
4806  kmp_hot_team_ptr_t *hot_teams;
4807  if( master ) {
4808  team = master->th.th_team;
4809  level = team->t.t_active_level;
4810  if( master->th.th_teams_microtask ) { // in teams construct?
4811  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4812  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4813  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4814  ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4815  }
4816  }
4817  hot_teams = master->th.th_hot_teams;
4818  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4819  { // hot team has already been allocated for given level
4820  use_hot_team = 1;
4821  } else {
4822  use_hot_team = 0;
4823  }
4824  }
4825 #endif
4826  // Optimization to use a "hot" team
4827  if( use_hot_team && new_nproc > 1 ) {
4828  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4829 #if KMP_NESTED_HOT_TEAMS
4830  team = hot_teams[level].hot_team;
4831 #else
4832  team = root->r.r_hot_team;
4833 #endif
4834 #if KMP_DEBUG
4835  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4836  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4837  team->t.t_task_team[0], team->t.t_task_team[1] ));
4838  }
4839 #endif
4840 
4841  // Has the number of threads changed?
4842  /* Let's assume the most common case is that the number of threads is unchanged, and
4843  put that case first. */
4844  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4845  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4846  // This case can mean that omp_set_num_threads() was called and the hot team size
4847  // was already reduced, so we check the special flag
4848  if ( team->t.t_size_changed == -1 ) {
4849  team->t.t_size_changed = 1;
4850  } else {
4851  team->t.t_size_changed = 0;
4852  }
4853 
4854  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4855  team->t.t_sched = new_icvs->sched;
4856 
4857  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4858 
4859  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4860  0, team->t.t_threads[0], team ) );
4861  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4862 
4863 #if OMP_40_ENABLED
4864 # if KMP_AFFINITY_SUPPORTED
4865  if ( ( team->t.t_size_changed == 0 )
4866  && ( team->t.t_proc_bind == new_proc_bind ) ) {
4867  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4868  team->t.t_id, new_proc_bind, team->t.t_first_place,
4869  team->t.t_last_place ) );
4870  }
4871  else {
4872  team->t.t_proc_bind = new_proc_bind;
4873  __kmp_partition_places( team );
4874  }
4875 # else
4876  if ( team->t.t_proc_bind != new_proc_bind ) {
4877  team->t.t_proc_bind = new_proc_bind;
4878  }
4879 # endif /* KMP_AFFINITY_SUPPORTED */
4880 #endif /* OMP_40_ENABLED */
4881  }
4882  else if( team->t.t_nproc > new_nproc ) {
4883  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4884 
4885  team->t.t_size_changed = 1;
4886 #if KMP_NESTED_HOT_TEAMS
4887  if( __kmp_hot_teams_mode == 0 ) {
4888  // AC: the saved number of threads should correspond to the team's value in this mode;
4889  // it can be bigger in mode 1, when the hot team has some threads in reserve
4890  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4891  hot_teams[level].hot_team_nth = new_nproc;
4892 #endif // KMP_NESTED_HOT_TEAMS
4893  /* release the extra threads we don't need any more */
4894  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4895  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4896  if ( __kmp_tasking_mode != tskm_immediate_exec) {
4897  // When decreasing team size, threads no longer in the team should unref task team.
4898  team->t.t_threads[f]->th.th_task_team = NULL;
4899  }
4900  __kmp_free_thread( team->t.t_threads[ f ] );
4901  team->t.t_threads[ f ] = NULL;
4902  }
4903 #if KMP_NESTED_HOT_TEAMS
4904  } // (__kmp_hot_teams_mode == 0)
4905 #endif // KMP_NESTED_HOT_TEAMS
4906  team->t.t_nproc = new_nproc;
4907  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4908  team->t.t_sched = new_icvs->sched;
4909  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4910 
4911  /* update the remaining threads */
4912  for(f = 0; f < new_nproc; ++f) {
4913  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4914  }
4915  // restore the current task state of the master thread: should be the implicit task
4916  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4917  0, team->t.t_threads[0], team ) );
4918 
4919  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4920 
4921 #ifdef KMP_DEBUG
4922  for ( f = 0; f < team->t.t_nproc; f++ ) {
4923  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4924  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4925  }
4926 #endif
4927 
4928 #if OMP_40_ENABLED
4929  team->t.t_proc_bind = new_proc_bind;
4930 # if KMP_AFFINITY_SUPPORTED
4931  __kmp_partition_places( team );
4932 # endif
4933 #endif
4934  }
4935  else { // team->t.t_nproc < new_nproc
4936 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4937  kmp_affin_mask_t *old_mask;
4938  if ( KMP_AFFINITY_CAPABLE() ) {
4939  KMP_CPU_ALLOC(old_mask);
4940  }
4941 #endif
4942 
4943  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4944 
4945  team->t.t_size_changed = 1;
4946 
4947 
4948 #if KMP_NESTED_HOT_TEAMS
4949  int avail_threads = hot_teams[level].hot_team_nth;
4950  if( new_nproc < avail_threads )
4951  avail_threads = new_nproc;
4952  kmp_info_t **other_threads = team->t.t_threads;
4953  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4954  // Adjust barrier data of reserved threads (if any) of the team
4955  // Other data will be set in __kmp_initialize_info() below.
4956  int b;
4957  kmp_balign_t * balign = other_threads[f]->th.th_bar;
4958  for ( b = 0; b < bs_last_barrier; ++ b ) {
4959  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4960  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4961 #if USE_DEBUGGER
4962  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4963 #endif
4964  }
4965  }
4966  if( hot_teams[level].hot_team_nth >= new_nproc ) {
4967  // we have all needed threads in reserve, no need to allocate any
4968  // this is only possible in mode 1; mode 0 cannot have reserved threads
4969  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4970  team->t.t_nproc = new_nproc; // just get reserved threads involved
4971  } else {
4972  // we may have some threads in reserve, but not enough
4973  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4974  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4975 #endif // KMP_NESTED_HOT_TEAMS
4976  if(team->t.t_max_nproc < new_nproc) {
4977  /* reallocate larger arrays */
4978  __kmp_reallocate_team_arrays(team, new_nproc);
4979  __kmp_reinitialize_team( team, new_icvs, NULL );
4980  }
4981 
4982 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4983  /* Temporarily set full mask for master thread before
4984  creation of workers. The reason is that workers inherit
4985  the affinity from master, so if a lot of workers are
4986  created quickly on a single core, they don't get
4987  a chance to set their own affinity for a long time.
4988  */
4989  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4990 #endif
4991 
4992  /* allocate new threads for the hot team */
4993  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
4994  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4995  KMP_DEBUG_ASSERT( new_worker );
4996  team->t.t_threads[ f ] = new_worker;
4997 
4998  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%llu, plain=%llu\n",
4999  team->t.t_id, __kmp_gtid_from_tid( f, team ),
5000  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5001  team->t.t_bar[bs_plain_barrier].b_arrived ) );
5002 
5003  { // Initialize barrier data for new threads.
5004  int b;
5005  kmp_balign_t * balign = new_worker->th.th_bar;
5006  for( b = 0; b < bs_last_barrier; ++ b ) {
5007  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5008  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5009 #if USE_DEBUGGER
5010  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5011 #endif
5012  }
5013  }
5014  }
5015 
5016 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5017  if ( KMP_AFFINITY_CAPABLE() ) {
5018  /* Restore initial master thread's affinity mask */
5019  __kmp_set_system_affinity( old_mask, TRUE );
5020  KMP_CPU_FREE(old_mask);
5021  }
5022 #endif
5023 #if KMP_NESTED_HOT_TEAMS
5024  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5025 #endif // KMP_NESTED_HOT_TEAMS
5026  /* make sure everyone is synchronized */
5027  int old_nproc = team->t.t_nproc; // save old value and use to update only new threads below
5028  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5029 
5030  /* reinitialize the threads */
5031  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5032  for (f=0; f < team->t.t_nproc; ++f)
5033  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5034  if (level) { // set th_task_state for new threads in nested hot team
5035  // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
5036  // th_task_state for the new threads. th_task_state for master thread will not be accurate until
5037  // after this in __kmp_fork_call(), so we look to the master's memo_stack to get the correct value.
5038  for (f=old_nproc; f < team->t.t_nproc; ++f)
5039  team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5040  }
5041  else { // set th_task_state for new threads in non-nested hot team
5042  int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
5043  for (f=old_nproc; f < team->t.t_nproc; ++f)
5044  team->t.t_threads[f]->th.th_task_state = old_state;
5045  }
5046 
5047 #ifdef KMP_DEBUG
5048  for ( f = 0; f < team->t.t_nproc; ++ f ) {
5049  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5050  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5051  }
5052 #endif
5053 
5054 #if OMP_40_ENABLED
5055  team->t.t_proc_bind = new_proc_bind;
5056 # if KMP_AFFINITY_SUPPORTED
5057  __kmp_partition_places( team );
5058 # endif
5059 #endif
5060  } // Check changes in number of threads
5061 
5062 #if OMP_40_ENABLED
5063  kmp_info_t *master = team->t.t_threads[0];
5064  if( master->th.th_teams_microtask ) {
5065  for( f = 1; f < new_nproc; ++f ) {
5066  // propagate teams construct specific info to workers
5067  kmp_info_t *thr = team->t.t_threads[f];
5068  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5069  thr->th.th_teams_level = master->th.th_teams_level;
5070  thr->th.th_teams_size = master->th.th_teams_size;
5071  }
5072  }
5073 #endif /* OMP_40_ENABLED */
5074 #if KMP_NESTED_HOT_TEAMS
5075  if( level ) {
5076  // Sync barrier state for nested hot teams, not needed for outermost hot team.
5077  for( f = 1; f < new_nproc; ++f ) {
5078  kmp_info_t *thr = team->t.t_threads[f];
5079  int b;
5080  kmp_balign_t * balign = thr->th.th_bar;
5081  for( b = 0; b < bs_last_barrier; ++ b ) {
5082  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5083  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5084 #if USE_DEBUGGER
5085  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5086 #endif
5087  }
5088  }
5089  }
5090 #endif // KMP_NESTED_HOT_TEAMS
5091 
5092  /* reallocate space for arguments if necessary */
5093  __kmp_alloc_argv_entries( argc, team, TRUE );
5094  team->t.t_argc = argc;
5095  //
5096  // The hot team re-uses the previous task team,
5097  // if untouched during the previous release->gather phase.
5098  //
5099 
5100  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5101 
5102 #if KMP_DEBUG
5103  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5104  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5105  team->t.t_task_team[0], team->t.t_task_team[1] ));
5106  }
5107 #endif
5108 
5109 #if OMPT_SUPPORT
5110  __ompt_team_assign_id(team, ompt_parallel_id);
5111 #endif
5112 
5113  KMP_MB();
5114 
5115  return team;
5116  }
5117 
5118  /* next, let's try to take one from the team pool */
5119  KMP_MB();
5120  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5121  {
5122  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5123  if ( team->t.t_max_nproc >= max_nproc ) {
5124  /* take this team from the team pool */
5125  __kmp_team_pool = team->t.t_next_pool;
5126 
5127  /* setup the team for fresh use */
5128  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5129 
5130  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5131  &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5132  team->t.t_task_team[0] = NULL;
5133  team->t.t_task_team[1] = NULL;
5134 
5135  /* reallocate space for arguments if necessary */
5136  __kmp_alloc_argv_entries( argc, team, TRUE );
5137  team->t.t_argc = argc;
5138 
5139  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5140  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5141  { // Initialize barrier data.
5142  int b;
5143  for ( b = 0; b < bs_last_barrier; ++ b) {
5144  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5145 #if USE_DEBUGGER
5146  team->t.t_bar[ b ].b_master_arrived = 0;
5147  team->t.t_bar[ b ].b_team_arrived = 0;
5148 #endif
5149  }
5150  }
5151 
5152 #if OMP_40_ENABLED
5153  team->t.t_proc_bind = new_proc_bind;
5154 #endif
5155 
5156  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5157 
5158 #if OMPT_SUPPORT
5159  __ompt_team_assign_id(team, ompt_parallel_id);
5160 #endif
5161 
5162  KMP_MB();
5163 
5164  return team;
5165  }
5166 
5167  /* reap team if it is too small, then loop back and check the next one */
5168  /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5169  /* TODO: Use technique to find the right size hot-team, don't reap them */
5170  team = __kmp_reap_team( team );
5171  __kmp_team_pool = team;
5172  }
5173 
5174  /* nothing available in the pool, no matter, make a new team! */
5175  KMP_MB();
5176  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5177 
5178  /* and set it up */
5179  team->t.t_max_nproc = max_nproc;
5180  /* NOTE: for some reason, allocating one big buffer and dividing it
5181  * up seems to hurt performance a lot on the P4, so let's not use
5182  * this approach... */
5183  __kmp_allocate_team_arrays( team, max_nproc );
5184 
5185  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5186  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5187 
5188  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5189  &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5190  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5191  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5192 
5193  if ( __kmp_storage_map ) {
5194  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5195  }
5196 
5197  /* allocate space for arguments */
5198  __kmp_alloc_argv_entries( argc, team, FALSE );
5199  team->t.t_argc = argc;
5200 
5201  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5202  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5203  { // Initialize barrier data.
5204  int b;
5205  for ( b = 0; b < bs_last_barrier; ++ b ) {
5206  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5207 #if USE_DEBUGGER
5208  team->t.t_bar[ b ].b_master_arrived = 0;
5209  team->t.t_bar[ b ].b_team_arrived = 0;
5210 #endif
5211  }
5212  }
5213 
5214 #if OMP_40_ENABLED
5215  team->t.t_proc_bind = new_proc_bind;
5216 #endif
5217 
5218 #if OMPT_SUPPORT
5219  __ompt_team_assign_id(team, ompt_parallel_id);
5220  team->t.ompt_serialized_team_info = NULL;
5221 #endif
5222 
5223  KMP_MB();
5224 
5225  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5226 
5227  return team;
5228 }
5229 
5230 /* TODO implement hot-teams at all levels */
5231 /* TODO implement lazy thread release on demand (disband request) */
5232 
5233 /* free the team. return it to the team pool. release all the threads
5234  * associated with it */
5235 void
5236 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
5237 {
5238  int f;
5239  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5240 
5241  /* verify state */
5242  KMP_DEBUG_ASSERT( root );
5243  KMP_DEBUG_ASSERT( team );
5244  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5245  KMP_DEBUG_ASSERT( team->t.t_threads );
5246 
5247  int use_hot_team = team == root->r.r_hot_team;
5248 #if KMP_NESTED_HOT_TEAMS
5249  int level;
5250  kmp_hot_team_ptr_t *hot_teams;
5251  if( master ) {
5252  level = team->t.t_active_level - 1;
5253  if( master->th.th_teams_microtask ) { // in teams construct?
5254  if( master->th.th_teams_size.nteams > 1 ) {
5255  ++level; // level was not increased in teams construct for team_of_masters
5256  }
5257  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5258  master->th.th_teams_level == team->t.t_level ) {
5259  ++level; // level was not increased in teams construct for team_of_workers before the parallel
5260  } // team->t.t_level will be increased inside parallel
5261  }
5262  hot_teams = master->th.th_hot_teams;
5263  if( level < __kmp_hot_teams_max_level ) {
5264  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5265  use_hot_team = 1;
5266  }
5267  }
5268 #endif // KMP_NESTED_HOT_TEAMS
5269 
5270  /* team is done working */
5271  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5272  team->t.t_copyin_counter = 0; // init counter for possible reuse
5273  // Do not reset pointer to parent team to NULL for hot teams.
5274 
5275  /* if we are non-hot team, release our threads */
5276  if( ! use_hot_team ) {
5277  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5278  // Delete task teams
5279  int tt_idx;
5280  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5281  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5282  if ( task_team != NULL ) {
5283  for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5284  team->t.t_threads[f]->th.th_task_team = NULL;
5285  }
5286  KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5287 #if KMP_NESTED_HOT_TEAMS
5288  __kmp_free_task_team( master, task_team );
5289 #endif
5290  team->t.t_task_team[tt_idx] = NULL;
5291  }
5292  }
5293  }
5294 
5295  // Reset pointer to parent team only for non-hot teams.
5296  team->t.t_parent = NULL;
5297 
5298 
5299  /* free the worker threads */
5300  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5301  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5302  __kmp_free_thread( team->t.t_threads[ f ] );
5303  team->t.t_threads[ f ] = NULL;
5304  }
5305 
5306 
5307  /* put the team back in the team pool */
5308  /* TODO limit size of team pool, call reap_team if pool too large */
5309  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5310  __kmp_team_pool = (volatile kmp_team_t*) team;
5311  }
5312 
5313  KMP_MB();
5314 }
5315 
5316 
5317 /* reap the team. destroy it, reclaim all its resources and free its memory */
5318 kmp_team_t *
5319 __kmp_reap_team( kmp_team_t *team )
5320 {
5321  kmp_team_t *next_pool = team->t.t_next_pool;
5322 
5323  KMP_DEBUG_ASSERT( team );
5324  KMP_DEBUG_ASSERT( team->t.t_dispatch );
5325  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5326  KMP_DEBUG_ASSERT( team->t.t_threads );
5327  KMP_DEBUG_ASSERT( team->t.t_argv );
5328 
5329  /* TODO clean the threads that are a part of this? */
5330 
5331  /* free stuff */
5332 
5333  __kmp_free_team_arrays( team );
5334  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5335  __kmp_free( (void*) team->t.t_argv );
5336  __kmp_free( team );
5337 
5338  KMP_MB();
5339  return next_pool;
5340 }
5341 
5342 //
5343 // Free the thread. Don't reap it, just place it on the pool of available
5344 // threads.
5345 //
5346 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5347 // binding for the affinity mechanism to be useful.
5348 //
5349 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5350 // However, we want to avoid a potential performance problem by always
5351 // scanning through the list to find the correct point at which to insert
5352 // the thread (potential N**2 behavior). To do this we keep track of the
5353 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5354 // With single-level parallelism, threads will always be added to the tail
5355 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5356 // parallelism, all bets are off and we may need to scan through the entire
5357 // free list.
5358 //
5359 // This change also has a potentially large performance benefit for some
5360 // applications. Previously, as threads were freed from the hot team, they
5361 // would be placed back on the free list in inverse order. If the hot team
5362 // grew back to its original size, then the freed threads would be placed
5363 // back on the hot team in reverse order. This could cause bad cache
5364 // locality problems on programs where the size of the hot team regularly
5365 // grew and shrank.
5366 //
5367 // Now, for single-level parallelism, the OMP tid is always == gtid.
5368 //
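//
// For example, if the pool holds threads with gtids 2, 3 and 5 and a thread with
// gtid 4 is freed, the scan starts at __kmp_thread_pool_insert_pt (or at the head
// of the pool if that hint already points past gtid 4), walks forward to the link
// between 3 and 5, and splices the thread in there, keeping the list sorted.
//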
5369 void
5370 __kmp_free_thread( kmp_info_t *this_th )
5371 {
5372  int gtid;
5373  kmp_info_t **scan;
5374 
5375  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5376  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5377 
5378  KMP_DEBUG_ASSERT( this_th );
5379 
5380  // When moving thread to pool, switch thread to wait on own b_go flag, and uninitialized (NULL team).
5381  int b;
5382  kmp_balign_t *balign = this_th->th.th_bar;
5383  for (b=0; b<bs_last_barrier; ++b) {
5384  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5385  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5386  balign[b].bb.team = NULL;
5387  }
5388  this_th->th.th_task_state = 0;
5389 
5390 
5391  /* put thread back on the free pool */
5392  TCW_PTR(this_th->th.th_team, NULL);
5393  TCW_PTR(this_th->th.th_root, NULL);
5394  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5395 
5396  //
5397  // If the __kmp_thread_pool_insert_pt is already past the new insert
5398  // point, then we need to re-scan the entire list.
5399  //
5400  gtid = this_th->th.th_info.ds.ds_gtid;
5401  if ( __kmp_thread_pool_insert_pt != NULL ) {
5402  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5403  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5404  __kmp_thread_pool_insert_pt = NULL;
5405  }
5406  }
5407 
5408  //
5409  // Scan down the list to find the place to insert the thread.
5410  // scan is the address of a link in the list, possibly the address of
5411  // __kmp_thread_pool itself.
5412  //
5413  // In the absence of nested parallelism, the for loop will have 0 iterations.
5414  //
5415  if ( __kmp_thread_pool_insert_pt != NULL ) {
5416  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5417  }
5418  else {
5419  scan = (kmp_info_t **)&__kmp_thread_pool;
5420  }
5421  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5422  scan = &( (*scan)->th.th_next_pool ) );
5423 
5424  //
5425  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5426  // to its address.
5427  //
5428  TCW_PTR(this_th->th.th_next_pool, *scan);
5429  __kmp_thread_pool_insert_pt = *scan = this_th;
5430  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5431  || ( this_th->th.th_info.ds.ds_gtid
5432  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5433  TCW_4(this_th->th.th_in_pool, TRUE);
5434  __kmp_thread_pool_nth++;
5435 
5436  TCW_4(__kmp_nth, __kmp_nth - 1);
5437 
5438 #ifdef KMP_ADJUST_BLOCKTIME
5439  /* Adjust blocktime back to user setting or default if necessary */
5440  /* Middle initialization might never have occurred */
5441  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5442  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5443  if ( __kmp_nth <= __kmp_avail_proc ) {
5444  __kmp_zero_bt = FALSE;
5445  }
5446  }
5447 #endif /* KMP_ADJUST_BLOCKTIME */
5448 
5449  KMP_MB();
5450 }
5451 
5452 
5453 /* ------------------------------------------------------------------------ */
5454 
5455 void *
5456 __kmp_launch_thread( kmp_info_t *this_thr )
5457 {
5458  int gtid = this_thr->th.th_info.ds.ds_gtid;
5459 /* void *stack_data;*/
5460  kmp_team_t *(*volatile pteam);
5461 
5462  KMP_MB();
5463  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5464 
5465  if( __kmp_env_consistency_check ) {
5466  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5467  }
5468 
5469 #if OMPT_SUPPORT
5470  if (ompt_enabled) {
5471  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5472  this_thr->th.ompt_thread_info.wait_id = 0;
5473  this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5474  if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5475  __ompt_thread_begin(ompt_thread_worker, gtid);
5476  }
5477  }
5478 #endif
5479 
5480  /* This is the place where threads wait for work */
5481  while( ! TCR_4(__kmp_global.g.g_done) ) {
5482  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5483  KMP_MB();
5484 
5485  /* wait for work to do */
5486  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5487 
5488 #if OMPT_SUPPORT
5489  if (ompt_enabled) {
5490  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5491  }
5492 #endif
5493 
5494  /* No tid yet since not part of a team */
5495  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5496 
5497 #if OMPT_SUPPORT
5498  if (ompt_enabled) {
5499  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5500  }
5501 #endif
5502 
5503  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5504 
5505  /* have we been allocated? */
5506  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5507  /* we were just woken up, so run our new task */
5508  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5509  int rc;
5510  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5511  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5512 
5513  updateHWFPControl (*pteam);
5514 
5515 #if OMPT_SUPPORT
5516  if (ompt_enabled) {
5517  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5518  // Initialize OMPT task id for implicit task.
5519  int tid = __kmp_tid_from_gtid(gtid);
5520  (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id =
5521  __ompt_task_id_new(tid);
5522  }
5523 #endif
5524 
5525  KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
5526  {
5527  KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
5528  rc = (*pteam)->t.t_invoke( gtid );
5529  }
5530  KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
5531  KMP_ASSERT( rc );
5532 
5533 #if OMPT_SUPPORT
5534  if (ompt_enabled) {
5535  /* no frame set while outside task */
5536  int tid = __kmp_tid_from_gtid(gtid);
5537  (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
5538 
5539  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5540  }
5541 #endif
5542  KMP_MB();
5543  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5544  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5545  }
5546  /* join barrier after parallel region */
5547  __kmp_join_barrier( gtid );
5548  }
5549  }
5550  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5551 
5552 #if OMPT_SUPPORT
5553  if (ompt_enabled &&
5554  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5555  __ompt_thread_end(ompt_thread_worker, gtid);
5556  }
5557 #endif
5558 
5559  this_thr->th.th_task_team = NULL;
5560  /* run the destructors for the threadprivate data for this thread */
5561  __kmp_common_destroy_gtid( gtid );
5562 
5563  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5564  KMP_MB();
5565  return this_thr;
5566 }
5567 
5568 /* ------------------------------------------------------------------------ */
5569 /* ------------------------------------------------------------------------ */
5570 
5571 void
5572 __kmp_internal_end_dest( void *specific_gtid )
5573 {
5574  #if KMP_COMPILER_ICC
5575  #pragma warning( push )
5576  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5577  #endif
5578  // Make sure no significant bits are lost
5579  int gtid = (kmp_intptr_t)specific_gtid - 1;
5580  #if KMP_COMPILER_ICC
5581  #pragma warning( pop )
5582  #endif
5583 
5584  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5585  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5586  * this is because 0 is reserved for the nothing-stored case */
5587 
5588  /* josh: One reason for setting the gtid specific data even when it is being
5589  destroyed by pthread is to allow gtid lookup through thread specific data
5590  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5591  that gets executed in the call to __kmp_internal_end_thread, actually
5592  gets the gtid through the thread specific data. Setting it here seems
5593  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5594  to run smoothly.
5595  todo: get rid of this after we remove the dependence on
5596  __kmp_gtid_get_specific
5597  */
5598  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5599  __kmp_gtid_set_specific( gtid );
5600  #ifdef KMP_TDATA_GTID
5601  __kmp_gtid = gtid;
5602  #endif
5603  __kmp_internal_end_thread( gtid );
5604 }
5605 
5606 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5607 
5608 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5609 // perfectly, but in real libomp.so I have no evidence it is ever called. However, the -fini linker
5610 // option in makefile.mk works fine.
5611 
5612 __attribute__(( destructor ))
5613 void
5614 __kmp_internal_end_dtor( void )
5615 {
5616  __kmp_internal_end_atexit();
5617 }
5618 
5619 void
5620 __kmp_internal_end_fini( void )
5621 {
5622  __kmp_internal_end_atexit();
5623 }
5624 
5625 #endif
5626 
5627 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5628 void
5629 __kmp_internal_end_atexit( void )
5630 {
5631  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5632  /* [Windows]
5633  josh: ideally, we want to completely shutdown the library in this atexit handler, but
5634  stat code that depends on thread specific data for gtid fails because that data becomes
5635  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5636  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5637  stat code and use __kmp_internal_end_library to cleanly shutdown the library.
5638 
5639 // TODO: Can some of this comment about GVS be removed?
5640  I suspect that the offending stat code is executed when the calling thread tries to
5641  clean up a dead root thread's data structures, resulting in GVS code trying to close
5642  the GVS structures for that thread, but since the stat code uses
5643  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5644  cleaning up itself instead of another thread, it gets confused. This happens because
5645  allowing a thread to unregister and cleanup another thread is a recent modification for
5646  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5647  thread may end up trying to unregister another thread only if thread death does not
5648  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5649  specific data destructor function to detect thread death. For Windows dynamic, there
5650  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5651  workaround is applicable only for Windows static stat library.
5652  */
5653  __kmp_internal_end_library( -1 );
5654  #if KMP_OS_WINDOWS
5655  __kmp_close_console();
5656  #endif
5657 }
5658 
5659 static void
5660 __kmp_reap_thread(
5661  kmp_info_t * thread,
5662  int is_root
5663 ) {
5664 
5665  // It is assumed __kmp_forkjoin_lock is acquired.
5666 
5667  int gtid;
5668 
5669  KMP_DEBUG_ASSERT( thread != NULL );
5670 
5671  gtid = thread->th.th_info.ds.ds_gtid;
5672 
5673  if ( ! is_root ) {
5674 
5675  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5676  /* Assume the threads are at the fork barrier here */
5677  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5678  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5679  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5680  __kmp_release_64(&flag);
5681  }; // if
5682 
5683 
5684  // Terminate OS thread.
5685  __kmp_reap_worker( thread );
5686 
5687  //
5688  // The thread was killed asynchronously. If it was actively
5689  // spinning in the thread pool, decrement the global count.
5690  //
5691  // There is a small timing hole here - if the worker thread was
5692  // just waking up after sleeping in the pool, had reset its
5693  // th_active_in_pool flag but not decremented the global counter
5694  // __kmp_thread_pool_active_nth yet, then the global counter
5695  // might not get updated.
5696  //
5697  // Currently, this can only happen as the library is unloaded,
5698  // so there are no harmful side effects.
5699  //
5700  if ( thread->th.th_active_in_pool ) {
5701  thread->th.th_active_in_pool = FALSE;
5702  KMP_TEST_THEN_DEC32(
5703  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5704  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5705  }
5706 
5707  // Decrement # of [worker] threads in the pool.
5708  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5709  --__kmp_thread_pool_nth;
5710  }; // if
5711 
5712  // Free the fast memory for tasking
5713  #if USE_FAST_MEMORY
5714  __kmp_free_fast_memory( thread );
5715  #endif /* USE_FAST_MEMORY */
5716 
5717  __kmp_suspend_uninitialize_thread( thread );
5718 
5719  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5720  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5721 
5722  -- __kmp_all_nth;
5723  // __kmp_nth was decremented when thread is added to the pool.
5724 
5725 #ifdef KMP_ADJUST_BLOCKTIME
5726  /* Adjust blocktime back to user setting or default if necessary */
5727  /* Middle initialization might never have occurred */
5728  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5729  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5730  if ( __kmp_nth <= __kmp_avail_proc ) {
5731  __kmp_zero_bt = FALSE;
5732  }
5733  }
5734 #endif /* KMP_ADJUST_BLOCKTIME */
5735 
5736  /* free the memory being used */
5737  if( __kmp_env_consistency_check ) {
5738  if ( thread->th.th_cons ) {
5739  __kmp_free_cons_stack( thread->th.th_cons );
5740  thread->th.th_cons = NULL;
5741  }; // if
5742  }
5743 
5744  if ( thread->th.th_pri_common != NULL ) {
5745  __kmp_free( thread->th.th_pri_common );
5746  thread->th.th_pri_common = NULL;
5747  }; // if
5748 
5749  if (thread->th.th_task_state_memo_stack != NULL) {
5750  __kmp_free(thread->th.th_task_state_memo_stack);
5751  thread->th.th_task_state_memo_stack = NULL;
5752  }
5753 
5754  #if KMP_USE_BGET
5755  if ( thread->th.th_local.bget_data != NULL ) {
5756  __kmp_finalize_bget( thread );
5757  }; // if
5758  #endif
5759 
5760 #if KMP_AFFINITY_SUPPORTED
5761  if ( thread->th.th_affin_mask != NULL ) {
5762  KMP_CPU_FREE( thread->th.th_affin_mask );
5763  thread->th.th_affin_mask = NULL;
5764  }; // if
5765 #endif /* KMP_AFFINITY_SUPPORTED */
5766 
5767  __kmp_reap_team( thread->th.th_serial_team );
5768  thread->th.th_serial_team = NULL;
5769  __kmp_free( thread );
5770 
5771  KMP_MB();
5772 
5773 } // __kmp_reap_thread
5774 
5775 static void
5776 __kmp_internal_end(void)
5777 {
5778  int i;
5779 
5780  /* First, unregister the library */
5781  __kmp_unregister_library();
5782 
5783  #if KMP_OS_WINDOWS
5784  /* In Win static library, we can't tell when a root actually dies, so we
5785  reclaim the data structures for any root threads that have died but not
5786  unregistered themselves, in order to shut down cleanly.
5787  In Win dynamic library we also can't tell when a thread dies.
5788  */
5789  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5790  #endif
5791 
5792  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5793  if( __kmp_root[i] )
5794  if( __kmp_root[i]->r.r_active )
5795  break;
5796  KMP_MB(); /* Flush all pending memory write invalidates. */
5797  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5798 
5799  if ( i < __kmp_threads_capacity ) {
5800  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5801  KMP_MB(); /* Flush all pending memory write invalidates. */
5802 
5803  //
5804  // Need to check that monitor was initialized before reaping it.
5805  // If we are called from __kmp_atfork_child (which sets
5806  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5807  // contain valid data, but it is only valid in the parent process,
5808  // not the child.
5809  //
5810  // New behavior (201008): instead of keying off of the flag
5811  // __kmp_init_parallel, the monitor thread creation is keyed off
5812  // of the new flag __kmp_init_monitor.
5813  //
5814  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5815  if ( TCR_4( __kmp_init_monitor ) ) {
5816  __kmp_reap_monitor( & __kmp_monitor );
5817  TCW_4( __kmp_init_monitor, 0 );
5818  }
5819  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5820  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5821  } else {
5822  /* TODO move this to cleanup code */
5823  #ifdef KMP_DEBUG
5824  /* make sure that everything has properly ended */
5825  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5826  if( __kmp_root[i] ) {
5827 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5828  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5829  }
5830  }
5831  #endif
5832 
5833  KMP_MB();
5834 
5835  // Reap the worker threads.
5836  // This is valid for now, but be careful if threads are reaped sooner.
5837  while ( __kmp_thread_pool != NULL ) { // Loop thru all the threads in the pool.
5838  // Get the next thread from the pool.
5839  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5840  __kmp_thread_pool = thread->th.th_next_pool;
5841  // Reap it.
5842  thread->th.th_next_pool = NULL;
5843  thread->th.th_in_pool = FALSE;
5844  __kmp_reap_thread( thread, 0 );
5845  }; // while
5846  __kmp_thread_pool_insert_pt = NULL;
5847 
5848  // Reap teams.
5849  while ( __kmp_team_pool != NULL ) { // Loop thru all the teams in the pool.
5850  // Get the next team from the pool.
5851  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5852  __kmp_team_pool = team->t.t_next_pool;
5853  // Reap it.
5854  team->t.t_next_pool = NULL;
5855  __kmp_reap_team( team );
5856  }; // while
5857 
5858  __kmp_reap_task_teams( );
5859 
5860  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5861  // TBD: Add some checking...
5862  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5863  }
5864 
5865  /* Make sure all threadprivate destructors get run by joining with all worker
5866  threads before resetting this flag */
5867  TCW_SYNC_4(__kmp_init_common, FALSE);
5868 
5869  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5870  KMP_MB();
5871 
5872  //
5873  // See note above: One of the possible fixes for CQ138434 / CQ140126
5874  //
5875  // FIXME: push both code fragments down and CSE them?
5876  // push them into __kmp_cleanup() ?
5877  //
5878  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5879  if ( TCR_4( __kmp_init_monitor ) ) {
5880  __kmp_reap_monitor( & __kmp_monitor );
5881  TCW_4( __kmp_init_monitor, 0 );
5882  }
5883  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5884  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5885 
5886  } /* else !__kmp_global.t_active */
5887  TCW_4(__kmp_init_gtid, FALSE);
5888  KMP_MB(); /* Flush all pending memory write invalidates. */
5889 
5890 
5891  __kmp_cleanup();
5892 #if OMPT_SUPPORT
5893  ompt_fini();
5894 #endif
5895 }
5896 
5897 void
5898 __kmp_internal_end_library( int gtid_req )
5899 {
5900  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5901  /* this shouldn't be a race condition because __kmp_internal_end() is the
5902  * only place to clear __kmp_serial_init */
5903  /* we'll check this later too, after we get the lock */
5904  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5905  // because the next check will work in any case.
5906  if( __kmp_global.g.g_abort ) {
5907  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5908  /* TODO abort? */
5909  return;
5910  }
5911  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5912  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5913  return;
5914  }
5915 
5916 
5917  KMP_MB(); /* Flush all pending memory write invalidates. */
5918 
5919  /* find out who we are and what we should do */
5920  {
5921  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5922  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5923  if( gtid == KMP_GTID_SHUTDOWN ) {
5924  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5925  return;
5926  } else if( gtid == KMP_GTID_MONITOR ) {
5927  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5928  return;
5929  } else if( gtid == KMP_GTID_DNE ) {
5930  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5931  /* we don't know who we are, but we may still shutdown the library */
5932  } else if( KMP_UBER_GTID( gtid )) {
5933  /* unregister ourselves as an uber thread. gtid is no longer valid */
5934  if( __kmp_root[gtid]->r.r_active ) {
5935  __kmp_global.g.g_abort = -1;
5936  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5937  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5938  return;
5939  } else {
5940  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5941  __kmp_unregister_root_current_thread( gtid );
5942  }
5943  } else {
5944  /* worker threads may call this function through the atexit handler, if they call exit() */
5945  /* For now, skip the usual subsequent processing and just dump the debug buffer.
5946  TODO: do a thorough shutdown instead
5947  */
5948  #ifdef DUMP_DEBUG_ON_EXIT
5949  if ( __kmp_debug_buf )
5950  __kmp_dump_debug_buffer( );
5951  #endif
5952  return;
5953  }
5954  }
5955  /* synchronize the termination process */
5956  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5957 
5958  /* have we already finished */
5959  if( __kmp_global.g.g_abort ) {
5960  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5961  /* TODO abort? */
5962  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5963  return;
5964  }
5965  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5966  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5967  return;
5968  }
5969 
5970  /* We need this lock to enforce mutex between this reading of
5971  __kmp_threads_capacity and the writing by __kmp_register_root.
5972  Alternatively, we can use a counter of roots that is
5973  atomically updated by __kmp_get_global_thread_id_reg,
5974  __kmp_do_serial_initialize and __kmp_internal_end_*.
5975  */
5976  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5977 
5978  /* now we can safely conduct the actual termination */
5979  __kmp_internal_end();
5980 
5981  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5982  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5983 
5984  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
5985 
5986  #ifdef DUMP_DEBUG_ON_EXIT
5987  if ( __kmp_debug_buf )
5988  __kmp_dump_debug_buffer();
5989  #endif
5990 
5991  #if KMP_OS_WINDOWS
5992  __kmp_close_console();
5993  #endif
5994 
5995  __kmp_fini_allocator();
5996 
5997 } // __kmp_internal_end_library
5998 
5999 void
6000 __kmp_internal_end_thread( int gtid_req )
6001 {
6002  int i;
6003 
6004  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6005  /* this shouldn't be a race condition because __kmp_internal_end() is the
6006  * only place to clear __kmp_serial_init */
6007  /* we'll check this later too, after we get the lock */
6008  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6009  // because the next check will work in any case.
6010  if( __kmp_global.g.g_abort ) {
6011  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6012  /* TODO abort? */
6013  return;
6014  }
6015  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6016  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6017  return;
6018  }
6019 
6020  KMP_MB(); /* Flush all pending memory write invalidates. */
6021 
6022  /* find out who we are and what we should do */
6023  {
6024  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6025  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
6026  if( gtid == KMP_GTID_SHUTDOWN ) {
6027  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6028  return;
6029  } else if( gtid == KMP_GTID_MONITOR ) {
6030  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6031  return;
6032  } else if( gtid == KMP_GTID_DNE ) {
6033  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6034  return;
6035  /* we don't know who we are */
6036  } else if( KMP_UBER_GTID( gtid )) {
6037  /* unregister ourselves as an uber thread. gtid is no longer valid */
6038  if( __kmp_root[gtid]->r.r_active ) {
6039  __kmp_global.g.g_abort = -1;
6040  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6041  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6042  return;
6043  } else {
6044  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6045  __kmp_unregister_root_current_thread( gtid );
6046  }
6047  } else {
6048  /* just a worker thread, let's leave */
6049  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6050 
6051  if ( gtid >= 0 ) {
6052  __kmp_threads[gtid]->th.th_task_team = NULL;
6053  }
6054 
6055  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6056  return;
6057  }
6058  }
6059  #if defined KMP_DYNAMIC_LIB
6060  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
6061  // because it is better to shut down later in the library destructor.
6062  // The reason for this change is a performance problem that occurs when a non-OpenMP thread
6063  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
6064  // keeping worker threads alive until program shutdown.
6065  // OM: Removed the Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6066  // Windows* (DPD200287443) that occurs when using critical sections from foreign threads.
6067  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6068  return;
6069  #endif
6070  /* synchronize the termination process */
6071  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6072 
6073  /* have we already finished */
6074  if( __kmp_global.g.g_abort ) {
6075  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6076  /* TODO abort? */
6077  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6078  return;
6079  }
6080  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6081  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6082  return;
6083  }
6084 
6085  /* We need this lock to enforce mutex between this reading of
6086  __kmp_threads_capacity and the writing by __kmp_register_root.
6087  Alternatively, we can use a counter of roots that is
6088  atomically updated by __kmp_get_global_thread_id_reg,
6089  __kmp_do_serial_initialize and __kmp_internal_end_*.
6090  */
6091 
6092  /* should we finish the run-time? are all siblings done? */
6093  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6094 
6095  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6096  if ( KMP_UBER_GTID( i ) ) {
6097  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6098  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6099  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6100  return;
6101  };
6102  }
6103 
6104  /* now we can safely conduct the actual termination */
6105 
6106  __kmp_internal_end();
6107 
6108  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6109  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6110 
6111  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6112 
6113  #ifdef DUMP_DEBUG_ON_EXIT
6114  if ( __kmp_debug_buf )
6115  __kmp_dump_debug_buffer();
6116  #endif
6117 } // __kmp_internal_end_thread
6118 
6119 // -------------------------------------------------------------------------------------------------
6120 // Library registration stuff.
6121 
6122 static long __kmp_registration_flag = 0;
6123  // Random value used to indicate library initialization.
6124 static char * __kmp_registration_str = NULL;
6125  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6126 
6127 
6128 static inline
6129 char *
6130 __kmp_reg_status_name() {
6131  /*
6132  On RHEL 3u5, if linked statically, getpid() returns different values in each thread.
6133  If registration and unregistration happen in different threads (omp_misc_other_root_exit.cpp test case),
6134  the registered_lib_env env var cannot be found, because its name will contain a different pid.
6135  */
6136  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6137 } // __kmp_reg_status_name
6138 
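// Editorial sketch (not part of the original source, illustrative pid only): for a
// process with pid 12345, __kmp_reg_status_name() produces the string
// "__KMP_REGISTERED_LIB_12345", which is the environment variable used by the
// registration code below.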
6139 
6140 void
6141 __kmp_register_library_startup(
6142  void
6143 ) {
6144 
6145  char * name = __kmp_reg_status_name(); // Name of the environment variable.
6146  int done = 0;
6147  union {
6148  double dtime;
6149  long ltime;
6150  } time;
6151  #if KMP_OS_WINDOWS
6152  __kmp_initialize_system_tick();
6153  #endif
6154  __kmp_read_system_time( & time.dtime );
6155  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6156  __kmp_registration_str =
6157  __kmp_str_format(
6158  "%p-%lx-%s",
6159  & __kmp_registration_flag,
6160  __kmp_registration_flag,
6161  KMP_LIBRARY_FILE
6162  );
6163 
6164  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6165 
6166  while ( ! done ) {
6167 
6168  char * value = NULL; // Actual value of the environment variable.
6169 
6170  // Set the environment variable, but do not overwrite it if it already exists.
6171  __kmp_env_set( name, __kmp_registration_str, 0 );
6172  // Check that the variable was actually written.
6173  value = __kmp_env_get( name );
6174  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6175 
6176  done = 1; // Ok, environment variable set successfully, exit the loop.
6177 
6178  } else {
6179 
6180  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6181  // Check whether it is alive or dead.
6182  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6183  char * tail = value;
6184  char * flag_addr_str = NULL;
6185  char * flag_val_str = NULL;
6186  char const * file_name = NULL;
6187  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6188  __kmp_str_split( tail, '-', & flag_val_str, & tail );
6189  file_name = tail;
6190  if ( tail != NULL ) {
6191  long * flag_addr = 0;
6192  long flag_val = 0;
6193  KMP_SSCANF( flag_addr_str, "%p", & flag_addr );
6194  KMP_SSCANF( flag_val_str, "%lx", & flag_val );
6195  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6196  // First, check whether the environment-encoded address is mapped into the address space.
6197  // If so, dereference it to see if it still has the right value.
6198 
6199  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6200  neighbor = 1;
6201  } else {
6202  // If not, then we know the other copy of the library is no longer running.
6203  neighbor = 2;
6204  }; // if
6205  }; // if
6206  }; // if
6207  switch ( neighbor ) {
6208  case 0 : // Cannot parse environment variable -- neighbor status unknown.
6209  // Assume it is an incompatible format from a future version of the library.
6210  // Assume the other library is alive.
6211  // WARN( ... ); // TODO: Issue a warning.
6212  file_name = "unknown library";
6213  // Attention! Falling through to the next case. That's intentional.
6214  case 1 : { // Neighbor is alive.
6215  // Check whether this is allowed.
6216  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6217  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6218  // That's not allowed. Issue fatal error.
6219  __kmp_msg(
6220  kmp_ms_fatal,
6221  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6222  KMP_HNT( DuplicateLibrary ),
6223  __kmp_msg_null
6224  );
6225  }; // if
6226  KMP_INTERNAL_FREE( duplicate_ok );
6227  __kmp_duplicate_library_ok = 1;
6228  done = 1; // Exit the loop.
6229  } break;
6230  case 2 : { // Neighbor is dead.
6231  // Clear the variable and try to register library again.
6232  __kmp_env_unset( name );
6233  } break;
6234  default : {
6235  KMP_DEBUG_ASSERT( 0 );
6236  } break;
6237  }; // switch
6238 
6239  }; // if
6240  KMP_INTERNAL_FREE( (void *) value );
6241 
6242  }; // while
6243  KMP_INTERNAL_FREE( (void *) name );
6244 
6245 } // func __kmp_register_library_startup
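// Editorial sketch (assumption, illustrative values only): the value stored in the
// __KMP_REGISTERED_LIB_<pid> variable has the "%p-%lx-%s" form built above, e.g.
// something like
//
//   __KMP_REGISTERED_LIB_12345=0x7f12a4c0e010-cafe1a2b-libomp.so
//
// i.e. the address of __kmp_registration_flag, its current value, and
// KMP_LIBRARY_FILE. A stale value left by a dead process is detected by checking
// whether the encoded address is still mapped and still holds the encoded value;
// this is how the code above decides between neighbor == 1 and neighbor == 2.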
6246 
6247 
6248 void
6249 __kmp_unregister_library( void ) {
6250 
6251  char * name = __kmp_reg_status_name();
6252  char * value = __kmp_env_get( name );
6253 
6254  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6255  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
6256  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6257  // Ok, this is our variable. Delete it.
6258  __kmp_env_unset( name );
6259  }; // if
6260 
6261  KMP_INTERNAL_FREE( __kmp_registration_str );
6262  KMP_INTERNAL_FREE( value );
6263  KMP_INTERNAL_FREE( name );
6264 
6265  __kmp_registration_flag = 0;
6266  __kmp_registration_str = NULL;
6267 
6268 } // __kmp_unregister_library
6269 
6270 
6271 // End of Library registration stuff.
6272 // -------------------------------------------------------------------------------------------------
6273 
6274 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6275 
6276 static void __kmp_check_mic_type()
6277 {
6278  kmp_cpuid_t cpuid_state = {0};
6279  kmp_cpuid_t * cs_p = &cpuid_state;
6280  __kmp_x86_cpuid(1, 0, cs_p);
6281  // We don't support mic1 at the moment
6282  if( (cs_p->eax & 0xff0) == 0xB10 ) {
6283  __kmp_mic_type = mic2;
6284  } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6285  __kmp_mic_type = mic3;
6286  } else {
6287  __kmp_mic_type = non_mic;
6288  }
6289 }
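// Editorial note (assumption based on the standard CPUID.1:EAX field layout): the
// masks above select the family/model bits, so 0xB10 corresponds to family 0x0B,
// model 1 (the KNC coprocessor, mic2 here), and 0x50670 to family 6 with extended
// model 5 and model 7, i.e. display model 0x57 (KNL, mic3 here). Anything else is
// treated as non_mic.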
6290 
6291 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6292 
6293 static void
6294 __kmp_do_serial_initialize( void )
6295 {
6296  int i, gtid;
6297  int size;
6298 
6299  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6300 
6301  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6302  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6303  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6304  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6305  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6306 
6307 #if OMPT_SUPPORT
6308  ompt_pre_init();
6309 #endif
6310 
6311  __kmp_validate_locks();
6312 
6313  /* Initialize internal memory allocator */
6314  __kmp_init_allocator();
6315 
6316  /* Register the library startup via an environment variable
6317  and check to see whether another copy of the library is already
6318  registered. */
6319 
6320  __kmp_register_library_startup( );
6321 
6322  /* TODO reinitialization of library */
6323  if( TCR_4(__kmp_global.g.g_done) ) {
6324  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6325  }
6326 
6327  __kmp_global.g.g_abort = 0;
6328  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6329 
6330  /* initialize the locks */
6331 #if KMP_USE_ADAPTIVE_LOCKS
6332 #if KMP_DEBUG_ADAPTIVE_LOCKS
6333  __kmp_init_speculative_stats();
6334 #endif
6335 #endif
6336 #if KMP_STATS_ENABLED
6337  __kmp_init_tas_lock( & __kmp_stats_lock );
6338 #endif
6339  __kmp_init_lock( & __kmp_global_lock );
6340  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6341  __kmp_init_lock( & __kmp_debug_lock );
6342  __kmp_init_atomic_lock( & __kmp_atomic_lock );
6343  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
6344  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
6345  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
6346  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
6347  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
6348  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
6349  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
6350  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6351  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6352  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6353  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6354  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6355  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
6356  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
6357  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
6358  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6359 
6360  /* conduct initialization and initial setup of configuration */
6361 
6362  __kmp_runtime_initialize();
6363 
6364 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6365  __kmp_check_mic_type();
6366 #endif
6367 
6368  // Some global variable initialization moved here from kmp_env_initialize()
6369 #ifdef KMP_DEBUG
6370  kmp_diag = 0;
6371 #endif
6372  __kmp_abort_delay = 0;
6373 
6374  // From __kmp_init_dflt_team_nth()
6375  /* assume the entire machine will be used */
6376  __kmp_dflt_team_nth_ub = __kmp_xproc;
6377  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6378  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6379  }
6380  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6381  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6382  }
6383  __kmp_max_nth = __kmp_sys_max_nth;
6384 
6385  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6386  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6387  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6388  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6389  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6390  __kmp_library = library_throughput;
6391  // From KMP_SCHEDULE initialization
6392  __kmp_static = kmp_sch_static_balanced;
6393  // AC: do not use analytical here, because it is non-monotonic
6394  //__kmp_guided = kmp_sch_guided_iterative_chunked;
6395  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6396  // Barrier initialization. Moved here from the barrier branch bit control and
6397  // barrier method control parts of __kmp_env_initialize()
6398  #if KMP_FAST_REDUCTION_BARRIER
6399  #define kmp_reduction_barrier_gather_bb ((int)1)
6400  #define kmp_reduction_barrier_release_bb ((int)1)
6401  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6402  #define kmp_reduction_barrier_release_pat bp_hyper_bar
6403  #endif // KMP_FAST_REDUCTION_BARRIER
6404  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6405  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6406  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6407  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6408  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6409  #if KMP_FAST_REDUCTION_BARRIER
6410  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6411  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6412  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6413  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6414  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6415  }
6416  #endif // KMP_FAST_REDUCTION_BARRIER
6417  }
6418  #if KMP_FAST_REDUCTION_BARRIER
6419  #undef kmp_reduction_barrier_release_pat
6420  #undef kmp_reduction_barrier_gather_pat
6421  #undef kmp_reduction_barrier_release_bb
6422  #undef kmp_reduction_barrier_gather_bb
6423  #endif // KMP_FAST_REDUCTION_BARRIER
6424 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6425  if (__kmp_mic_type == mic2) { // KNC
6426  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6427  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
6428  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
6429  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6430  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6431  }
6432 #if KMP_FAST_REDUCTION_BARRIER
6433  if (__kmp_mic_type == mic2) { // KNC
6434  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6435  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6436  }
6437 #endif
6438 #endif
6439 
6440  // From KMP_CHECKS initialization
6441 #ifdef KMP_DEBUG
6442  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6443 #else
6444  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6445 #endif
6446 
6447  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6448  __kmp_foreign_tp = TRUE;
6449 
6450  __kmp_global.g.g_dynamic = FALSE;
6451  __kmp_global.g.g_dynamic_mode = dynamic_default;
6452 
6453  __kmp_env_initialize( NULL );
6454 
6455  // Print all messages in message catalog for testing purposes.
6456  #ifdef KMP_DEBUG
6457  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6458  if ( __kmp_str_match_true( val ) ) {
6459  kmp_str_buf_t buffer;
6460  __kmp_str_buf_init( & buffer );
6461  __kmp_i18n_dump_catalog( & buffer );
6462  __kmp_printf( "%s", buffer.str );
6463  __kmp_str_buf_free( & buffer );
6464  }; // if
6465  __kmp_env_free( & val );
6466  #endif
6467 
6468  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6469  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6470  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6471 
6472  // If the library is shut down properly, both pools must be NULL. Just in case, set them
6473  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6474  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6475  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6476  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
6477  __kmp_thread_pool = NULL;
6478  __kmp_thread_pool_insert_pt = NULL;
6479  __kmp_team_pool = NULL;
6480 
6481  /* Allocate all of the variable sized records */
6482  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6483  /* Since allocation is cache-aligned, just add extra padding at the end */
6484  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6485  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6486  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6487 
6488  /* init thread counts */
6489  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6490  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something was wrong in termination.
6491  __kmp_all_nth = 0;
6492  __kmp_nth = 0;
6493 
6494  /* setup the uber master thread and hierarchy */
6495  gtid = __kmp_register_root( TRUE );
6496  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6497  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6498  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6499 
6500  KMP_MB(); /* Flush all pending memory write invalidates. */
6501 
6502  __kmp_common_initialize();
6503 
6504  #if KMP_OS_UNIX
6505  /* invoke the child fork handler */
6506  __kmp_register_atfork();
6507  #endif
6508 
6509  #if ! defined KMP_DYNAMIC_LIB
6510  {
6511  /* Invoke the exit handler when the program finishes, but only for the static library.
6512  For the dynamic library, we already have _fini and DllMain.
6513  */
6514  int rc = atexit( __kmp_internal_end_atexit );
6515  if ( rc != 0 ) {
6516  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6517  }; // if
6518  }
6519  #endif
6520 
6521  #if KMP_HANDLE_SIGNALS
6522  #if KMP_OS_UNIX
6523  /* NOTE: make sure that this is called before the user installs
6524  * their own signal handlers so that the user handlers
6525  * are called first. this way they can return false,
6526  * not call our handler, avoid terminating the library,
6527  * and continue execution where they left off. */
6528  __kmp_install_signals( FALSE );
6529  #endif /* KMP_OS_UNIX */
6530  #if KMP_OS_WINDOWS
6531  __kmp_install_signals( TRUE );
6532  #endif /* KMP_OS_WINDOWS */
6533  #endif
6534 
6535  /* we have finished the serial initialization */
6536  __kmp_init_counter ++;
6537 
6538  __kmp_init_serial = TRUE;
6539 
6540  if (__kmp_settings) {
6541  __kmp_env_print();
6542  }
6543 
6544 #if OMP_40_ENABLED
6545  if (__kmp_display_env || __kmp_display_env_verbose) {
6546  __kmp_env_print_2();
6547  }
6548 #endif // OMP_40_ENABLED
6549 
6550 #if OMPT_SUPPORT
6551  ompt_post_init();
6552 #endif
6553 
6554  KMP_MB();
6555 
6556  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6557 }
6558 
6559 void
6560 __kmp_serial_initialize( void )
6561 {
6562  if ( __kmp_init_serial ) {
6563  return;
6564  }
6565  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6566  if ( __kmp_init_serial ) {
6567  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6568  return;
6569  }
6570  __kmp_do_serial_initialize();
6571  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6572 }
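// Editorial note: __kmp_serial_initialize(), __kmp_middle_initialize() and
// __kmp_parallel_initialize() below all follow the same double-checked pattern:
// an unsynchronized fast-path test of the init flag, then a re-test under
// __kmp_initz_lock before calling the __kmp_do_*_initialize() worker, so that
// concurrent first calls perform the initialization exactly once.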
6573 
6574 static void
6575 __kmp_do_middle_initialize( void )
6576 {
6577  int i, j;
6578  int prev_dflt_team_nth;
6579 
6580  if( !__kmp_init_serial ) {
6581  __kmp_do_serial_initialize();
6582  }
6583 
6584  KA_TRACE( 10, ("__kmp_do_middle_initialize: enter\n" ) );
6585 
6586  //
6587  // Save the previous value for the __kmp_dflt_team_nth so that
6588  // we can avoid some reinitialization if it hasn't changed.
6589  //
6590  prev_dflt_team_nth = __kmp_dflt_team_nth;
6591 
6592 #if KMP_AFFINITY_SUPPORTED
6593  //
6594  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6595  // number of cores on the machine.
6596  //
6597  __kmp_affinity_initialize();
6598 
6599  //
6600  // Run through the __kmp_threads array and set the affinity mask
6601  // for each root thread that is currently registered with the RTL.
6602  //
6603  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6604  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6605  __kmp_affinity_set_init_mask( i, TRUE );
6606  }
6607  }
6608 #endif /* KMP_AFFINITY_SUPPORTED */
6609 
6610  KMP_ASSERT( __kmp_xproc > 0 );
6611  if ( __kmp_avail_proc == 0 ) {
6612  __kmp_avail_proc = __kmp_xproc;
6613  }
6614 
6615  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6616  j = 0;
6617  while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6618  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6619  j++;
6620  }
6621 
6622  if ( __kmp_dflt_team_nth == 0 ) {
6623 #ifdef KMP_DFLT_NTH_CORES
6624  //
6625  // Default #threads = #cores
6626  //
6627  __kmp_dflt_team_nth = __kmp_ncores;
6628  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6629  __kmp_dflt_team_nth ) );
6630 #else
6631  //
6632  // Default #threads = #available OS procs
6633  //
6634  __kmp_dflt_team_nth = __kmp_avail_proc;
6635  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6636  __kmp_dflt_team_nth ) );
6637 #endif /* KMP_DFLT_NTH_CORES */
6638  }
6639 
6640  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6641  __kmp_dflt_team_nth = KMP_MIN_NTH;
6642  }
6643  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6644  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6645  }
6646 
6647  //
6648  // There's no harm in continuing if the following check fails,
6649  // but it indicates an error in the previous logic.
6650  //
6651  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6652 
6653  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6654  //
6655  // Run through the __kmp_threads array and set the num threads icv
6656  // for each root thread that is currently registered with the RTL
6657  // (which has not already explicitly set its nthreads-var with a
6658  // call to omp_set_num_threads()).
6659  //
6660  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6661  kmp_info_t *thread = __kmp_threads[ i ];
6662  if ( thread == NULL ) continue;
6663  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6664 
6665  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6666  }
6667  }
6668  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6669  __kmp_dflt_team_nth) );
6670 
6671 #ifdef KMP_ADJUST_BLOCKTIME
6672  /* Adjust blocktime to zero if necessary */
6673  /* now that __kmp_avail_proc is set */
6674  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6675  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6676  if ( __kmp_nth > __kmp_avail_proc ) {
6677  __kmp_zero_bt = TRUE;
6678  }
6679  }
6680 #endif /* KMP_ADJUST_BLOCKTIME */
6681 
6682  /* we have finished middle initialization */
6683  TCW_SYNC_4(__kmp_init_middle, TRUE);
6684 
6685  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6686 }
6687 
6688 void
6689 __kmp_middle_initialize( void )
6690 {
6691  if ( __kmp_init_middle ) {
6692  return;
6693  }
6694  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6695  if ( __kmp_init_middle ) {
6696  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6697  return;
6698  }
6699  __kmp_do_middle_initialize();
6700  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6701 }
6702 
6703 void
6704 __kmp_parallel_initialize( void )
6705 {
6706  int gtid = __kmp_entry_gtid(); // this might be a new root
6707 
6708  /* synchronize parallel initialization (for sibling) */
6709  if( TCR_4(__kmp_init_parallel) ) return;
6710  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6711  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6712 
6713  /* TODO reinitialization after we have already shut down */
6714  if( TCR_4(__kmp_global.g.g_done) ) {
6715  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6716  __kmp_infinite_loop();
6717  }
6718 
6719  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6720  or __kmp_middle_initialize would cause a deadlock. So we call __kmp_do_middle_initialize
6721  directly (it calls __kmp_do_serial_initialize itself if needed). */
6722  if( !__kmp_init_middle ) {
6723  __kmp_do_middle_initialize();
6724  }
6725 
6726  /* begin initialization */
6727  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6728  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6729 
6730 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6731  //
6732  // Save the FP control regs.
6733  // Worker threads will set theirs to these values at thread startup.
6734  //
6735  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6736  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6737  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6738 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6739 
6740 #if KMP_OS_UNIX
6741 # if KMP_HANDLE_SIGNALS
6742  /* must be after __kmp_serial_initialize */
6743  __kmp_install_signals( TRUE );
6744 # endif
6745 #endif
6746 
6747  __kmp_suspend_initialize();
6748 
6749 # if defined(USE_LOAD_BALANCE)
6750  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6751  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6752  }
6753 #else
6754  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6755  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6756  }
6757 #endif
6758 
6759  if ( __kmp_version ) {
6760  __kmp_print_version_2();
6761  }
6762 
6763  /* we have finished parallel initialization */
6764  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6765 
6766  KMP_MB();
6767  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6768 
6769  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6770 }
6771 
6772 
6773 /* ------------------------------------------------------------------------ */
6774 
6775 void
6776 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6777  kmp_team_t *team )
6778 {
6779  kmp_disp_t *dispatch;
6780 
6781  KMP_MB();
6782 
6783  /* none of the threads have encountered any constructs, yet. */
6784  this_thr->th.th_local.this_construct = 0;
6785 #if KMP_CACHE_MANAGE
6786  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6787 #endif /* KMP_CACHE_MANAGE */
6788  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6789  KMP_DEBUG_ASSERT( dispatch );
6790  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6791  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6792 
6793  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6794 
6795  if( __kmp_env_consistency_check )
6796  __kmp_push_parallel( gtid, team->t.t_ident );
6797 
6798  KMP_MB(); /* Flush all pending memory write invalidates. */
6799 }
6800 
6801 void
6802 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6803  kmp_team_t *team )
6804 {
6805  if( __kmp_env_consistency_check )
6806  __kmp_pop_parallel( gtid, team->t.t_ident );
6807 }
6808 
6809 int
6810 __kmp_invoke_task_func( int gtid )
6811 {
6812  int rc;
6813  int tid = __kmp_tid_from_gtid( gtid );
6814  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6815  kmp_team_t *team = this_thr->th.th_team;
6816 
6817  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6818 #if USE_ITT_BUILD
6819  if ( __itt_stack_caller_create_ptr ) {
6820  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6821  }
6822 #endif /* USE_ITT_BUILD */
6823 #if INCLUDE_SSC_MARKS
6824  SSC_MARK_INVOKING();
6825 #endif
6826 
6827 #if OMPT_SUPPORT
6828  void *dummy;
6829  void **exit_runtime_p;
6830  ompt_task_id_t my_task_id;
6831  ompt_parallel_id_t my_parallel_id;
6832 
6833  if (ompt_enabled) {
6834  exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6835  ompt_task_info.frame.exit_runtime_frame);
6836  } else {
6837  exit_runtime_p = &dummy;
6838  }
6839 
6840 #if OMPT_TRACE
6841  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6842  my_parallel_id = team->t.ompt_team_info.parallel_id;
6843  if (ompt_enabled &&
6844  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6845  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6846  my_parallel_id, my_task_id);
6847  }
6848 #endif
6849 #endif
6850 
6851  {
6852  KMP_TIME_BLOCK(OMP_work);
6853  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6854  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6855 #if OMPT_SUPPORT
6856  , exit_runtime_p
6857 #endif
6858  );
6859  }
6860 
6861 #if OMPT_SUPPORT && OMPT_TRACE
6862  if (ompt_enabled) {
6863  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
6864  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
6865  my_parallel_id, my_task_id);
6866  }
6867  // the implicit task is not dead yet, so we can't clear its task id here
6868  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
6869  }
6870 #endif
6871 
6872 #if USE_ITT_BUILD
6873  if ( __itt_stack_caller_create_ptr ) {
6874  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6875  }
6876 #endif /* USE_ITT_BUILD */
6877  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6878 
6879  return rc;
6880 }
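// Editorial sketch (assumption about the microtask calling convention):
// team->t.t_pkfn is the compiler-outlined parallel-region body, and
// __kmp_invoke_microtask() above is expected to call it roughly as
//
//   (*pkfn)( &gtid, &tid, argv[0], argv[1], ..., argv[argc-1] );
//
// so each worker executes the user code with its own gtid/tid.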
6881 
6882 #if OMP_40_ENABLED
6883 void
6884 __kmp_teams_master( int gtid )
6885 {
6886  // This routine is called by all master threads in teams construct
6887  kmp_info_t *thr = __kmp_threads[ gtid ];
6888  kmp_team_t *team = thr->th.th_team;
6889  ident_t *loc = team->t.t_ident;
6890  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6891  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6892  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6893  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6894  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6895  // Launch the league of teams now, but do not let the workers execute yet
6896  // (they wait on the fork barrier until the next parallel region)
6897 #if INCLUDE_SSC_MARKS
6898  SSC_MARK_FORKING();
6899 #endif
6900  __kmp_fork_call( loc, gtid, fork_context_intel,
6901  team->t.t_argc,
6902 #if OMPT_SUPPORT
6903  (void *)thr->th.th_teams_microtask, // "unwrapped" task
6904 #endif
6905  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6906  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6907  NULL );
6908 #if INCLUDE_SSC_MARKS
6909  SSC_MARK_JOINING();
6910 #endif
6911 
6912  // AC: the last parameter "1" eliminates the join barrier, which would not work because
6913  // the worker threads are in a fork barrier waiting for more parallel regions
6914  __kmp_join_call( loc, gtid
6915 #if OMPT_SUPPORT
6916  , fork_context_intel
6917 #endif
6918  , 1 );
6919 }
6920 
6921 int
6922 __kmp_invoke_teams_master( int gtid )
6923 {
6924  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6925  kmp_team_t *team = this_thr->th.th_team;
6926  #if KMP_DEBUG
6927  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6928  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6929  #endif
6930  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6931  __kmp_teams_master( gtid );
6932  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6933  return 1;
6934 }
6935 #endif /* OMP_40_ENABLED */
6936 
6937 /* this sets the requested number of threads for the next parallel region
6938  * encountered by this team */
6939 /* since this should be enclosed in the forkjoin critical section it
6940  * should avoid race conditions with asymmetric nested parallelism */
6941 
6942 void
6943 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6944 {
6945  kmp_info_t *thr = __kmp_threads[gtid];
6946 
6947  if( num_threads > 0 )
6948  thr->th.th_set_nproc = num_threads;
6949 }
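// Editorial usage sketch (illustrative, not generated code verbatim): for
//
//   #pragma omp parallel num_threads(4)
//
// the compiler-generated entry code is expected to request the thread count just
// before forking, roughly:
//
//   __kmp_push_num_threads( loc, gtid, 4 );  /* sets thr->th.th_set_nproc = 4 */
//   /* ... the subsequent fork call consumes th_set_nproc ... */
//
// The request applies only to the next parallel region encountered by this thread.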
6950 
6951 #if OMP_40_ENABLED
6952 
6953 /* this sets the requested number of teams for the teams region and/or
6954  * the number of threads for the next parallel region encountered */
6955 void
6956 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6957 {
6958  kmp_info_t *thr = __kmp_threads[gtid];
6959  KMP_DEBUG_ASSERT(num_teams >= 0);
6960  KMP_DEBUG_ASSERT(num_threads >= 0);
6961 
6962  if( num_teams == 0 )
6963  num_teams = 1; // default number of teams is 1.
6964  if( num_teams > __kmp_max_nth ) { // if too many teams requested?
6965  if ( !__kmp_reserve_warn ) {
6966  __kmp_reserve_warn = 1;
6967  __kmp_msg(
6968  kmp_ms_warning,
6969  KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
6970  KMP_HNT( Unset_ALL_THREADS ),
6971  __kmp_msg_null
6972  );
6973  }
6974  num_teams = __kmp_max_nth;
6975  }
6976  // Set number of teams (number of threads in the outer "parallel" of the teams)
6977  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6978 
6979  // Remember the number of threads for inner parallel regions
6980  if( num_threads == 0 ) {
6981  if( !TCR_4(__kmp_init_middle) )
6982  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6983  num_threads = __kmp_avail_proc / num_teams;
6984  if( num_teams * num_threads > __kmp_max_nth ) {
6985  // adjust num_threads w/o warning as it is not a user setting
6986  num_threads = __kmp_max_nth / num_teams;
6987  }
6988  } else {
6989  if( num_teams * num_threads > __kmp_max_nth ) {
6990  int new_threads = __kmp_max_nth / num_teams;
6991  if ( !__kmp_reserve_warn ) { // user asked for too many threads
6992  __kmp_reserve_warn = 1; // that conflicts with OMP_THREAD_LIMIT
6993  __kmp_msg(
6994  kmp_ms_warning,
6995  KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
6996  KMP_HNT( Unset_ALL_THREADS ),
6997  __kmp_msg_null
6998  );
6999  }
7000  num_threads = new_threads;
7001  }
7002  }
7003  thr->th.th_teams_size.nth = num_threads;
7004 }
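// Editorial worked example (illustrative numbers): with __kmp_avail_proc == 64 and
// __kmp_max_nth large enough, a call with num_teams == 4 and num_threads == 0
// yields num_threads = 64 / 4 = 16, i.e. 4 teams of 16 threads. If the product
// num_teams * num_threads would exceed __kmp_max_nth, num_threads is clipped to
// __kmp_max_nth / num_teams, with a warning only when the user set it explicitly.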
7005 
7006 
7007 //
7008 // Set the proc_bind var to use in the following parallel region.
7009 //
7010 void
7011 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7012 {
7013  kmp_info_t *thr = __kmp_threads[gtid];
7014  thr->th.th_set_proc_bind = proc_bind;
7015 }
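// Editorial usage sketch (assumption): for "#pragma omp parallel proc_bind(close)"
// the compiler-generated code would push the binding just before the fork, e.g.
//
//   __kmp_push_proc_bind( loc, gtid, proc_bind_close );
//
// and the stored value is consumed when the next parallel region is created.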
7016 
7017 #endif /* OMP_40_ENABLED */
7018 
7019 /* Launch the worker threads into the microtask. */
7020 
7021 void
7022 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7023 {
7024  kmp_info_t *this_thr = __kmp_threads[gtid];
7025 
7026 #ifdef KMP_DEBUG
7027  int f;
7028 #endif /* KMP_DEBUG */
7029 
7030  KMP_DEBUG_ASSERT( team );
7031  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7032  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7033  KMP_MB(); /* Flush all pending memory write invalidates. */
7034 
7035  team->t.t_construct = 0; /* no single directives seen yet */
7036  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7037 
7038  /* Reset the identifiers on the dispatch buffer */
7039  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7040  if ( team->t.t_max_nproc > 1 ) {
7041  int i;
7042  for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
7043  team->t.t_disp_buffer[ i ].buffer_index = i;
7044  } else {
7045  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7046  }
7047 
7048  KMP_MB(); /* Flush all pending memory write invalidates. */
7049  KMP_ASSERT( this_thr->th.th_team == team );
7050 
7051 #ifdef KMP_DEBUG
7052  for( f=0 ; f<team->t.t_nproc ; f++ ) {
7053  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7054  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7055  }
7056 #endif /* KMP_DEBUG */
7057 
7058  /* release the worker threads so they may begin working */
7059  __kmp_fork_barrier( gtid, 0 );
7060 }
7061 
7062 
7063 void
7064 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7065 {
7066  kmp_info_t *this_thr = __kmp_threads[gtid];
7067 
7068  KMP_DEBUG_ASSERT( team );
7069  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7070  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7071  KMP_MB(); /* Flush all pending memory write invalidates. */
7072 
7073  /* Join barrier after fork */
7074 
7075 #ifdef KMP_DEBUG
7076  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7077  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7078  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7079  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7080  __kmp_print_structure();
7081  }
7082  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7083  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7084 #endif /* KMP_DEBUG */
7085 
7086  __kmp_join_barrier( gtid ); /* wait for everyone */
7087 
7088  KMP_MB(); /* Flush all pending memory write invalidates. */
7089  KMP_ASSERT( this_thr->th.th_team == team );
7090 }
7091 
7092 
7093 /* ------------------------------------------------------------------------ */
7094 /* ------------------------------------------------------------------------ */
7095 
7096 #ifdef USE_LOAD_BALANCE
7097 
7098 //
7099 // Return the number of worker threads actively spinning in the hot team, if we
7100 // are at the outermost level of parallelism. Otherwise, return 0.
7101 //
7102 static int
7103 __kmp_active_hot_team_nproc( kmp_root_t *root )
7104 {
7105  int i;
7106  int retval;
7107  kmp_team_t *hot_team;
7108 
7109  if ( root->r.r_active ) {
7110  return 0;
7111  }
7112  hot_team = root->r.r_hot_team;
7113  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7114  return hot_team->t.t_nproc - 1; // Don't count master thread
7115  }
7116 
7117  //
7118  // Skip the master thread - it is accounted for elsewhere.
7119  //
7120  retval = 0;
7121  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7122  if ( hot_team->t.t_threads[i]->th.th_active ) {
7123  retval++;
7124  }
7125  }
7126  return retval;
7127 }
7128 
7129 //
7130 // Perform an automatic adjustment to the number of
7131 // threads used by the next parallel region.
7132 //
7133 static int
7134 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7135 {
7136  int retval;
7137  int pool_active;
7138  int hot_team_active;
7139  int team_curr_active;
7140  int system_active;
7141 
7142  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7143  root, set_nproc ) );
7144  KMP_DEBUG_ASSERT( root );
7145  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7146  KMP_DEBUG_ASSERT( set_nproc > 1 );
7147 
7148  if ( set_nproc == 1) {
7149  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7150  return 1;
7151  }
7152 
7153  //
7154  // Threads that are active in the thread pool, active in the hot team
7155  // for this particular root (if we are at the outer par level), and
7156  // the currently executing thread (to become the master) are available
7157  // to add to the new team, but are currently contributing to the system
7158  // load, and must be accounted for.
7159  //
7160  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7161  hot_team_active = __kmp_active_hot_team_nproc( root );
7162  team_curr_active = pool_active + hot_team_active + 1;
7163 
7164  //
7165  // Check the system load.
7166  //
7167  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7168  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7169  system_active, pool_active, hot_team_active ) );
7170 
7171  if ( system_active < 0 ) {
7172  //
7173  // There was an error reading the necessary info from /proc,
7174  // so use the thread limit algorithm instead. Once we set
7175  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7176  // we shouldn't wind up getting back here.
7177  //
7178  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7179  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7180 
7181  //
7182  // Make this call behave like the thread limit algorithm.
7183  //
7184  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7185  : root->r.r_hot_team->t.t_nproc);
7186  if ( retval > set_nproc ) {
7187  retval = set_nproc;
7188  }
7189  if ( retval < KMP_MIN_NTH ) {
7190  retval = KMP_MIN_NTH;
7191  }
7192 
7193  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7194  return retval;
7195  }
7196 
7197  //
7198  // There is a slight delay in the load balance algorithm in detecting
7199  // new running procs. The real system load at this instant should be
7200  // at least as large as the number of active OMP threads that are available to
7201  // add to the team.
7202  //
7203  if ( system_active < team_curr_active ) {
7204  system_active = team_curr_active;
7205  }
7206  retval = __kmp_avail_proc - system_active + team_curr_active;
7207  if ( retval > set_nproc ) {
7208  retval = set_nproc;
7209  }
7210  if ( retval < KMP_MIN_NTH ) {
7211  retval = KMP_MIN_NTH;
7212  }
7213 
7214  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7215  return retval;
7216 } // __kmp_load_balance_nproc()
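// Editorial worked example (illustrative numbers): suppose __kmp_avail_proc == 16,
// 2 threads are idling in the pool and 5 workers are active in the hot team, so
// team_curr_active == 2 + 5 + 1 == 8. If __kmp_get_load_balance() reports
// system_active == 10, then retval == 16 - 10 + 8 == 14, which is then clipped to
// the requested set_nproc and raised to at least KMP_MIN_NTH.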
7217 
7218 #endif /* USE_LOAD_BALANCE */
7219 
7220 /* ------------------------------------------------------------------------ */
7221 /* ------------------------------------------------------------------------ */
7222 
7223 /* NOTE: this is called with the __kmp_init_lock held */
7224 void
7225 __kmp_cleanup( void )
7226 {
7227  int f;
7228 
7229  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7230 
7231  if (TCR_4(__kmp_init_parallel)) {
7232 #if KMP_HANDLE_SIGNALS
7233  __kmp_remove_signals();
7234 #endif
7235  TCW_4(__kmp_init_parallel, FALSE);
7236  }
7237 
7238  if (TCR_4(__kmp_init_middle)) {
7239 #if KMP_AFFINITY_SUPPORTED
7240  __kmp_affinity_uninitialize();
7241 #endif /* KMP_AFFINITY_SUPPORTED */
7242  __kmp_cleanup_hierarchy();
7243  TCW_4(__kmp_init_middle, FALSE);
7244  }
7245 
7246  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7247 
7248  if (__kmp_init_serial) {
7249  __kmp_runtime_destroy();
7250  __kmp_init_serial = FALSE;
7251  }
7252 
7253  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7254  if ( __kmp_root[ f ] != NULL ) {
7255  __kmp_free( __kmp_root[ f ] );
7256  __kmp_root[ f ] = NULL;
7257  }
7258  }
7259  __kmp_free( __kmp_threads );
7260  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is
7261  // no need to free __kmp_root separately.
7262  __kmp_threads = NULL;
7263  __kmp_root = NULL;
7264  __kmp_threads_capacity = 0;
7265 
7266 #if KMP_USE_DYNAMIC_LOCK
7267  __kmp_cleanup_indirect_user_locks();
7268 #else
7269  __kmp_cleanup_user_locks();
7270 #endif
7271 
7272  #if KMP_AFFINITY_SUPPORTED
7273  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7274  __kmp_cpuinfo_file = NULL;
7275  #endif /* KMP_AFFINITY_SUPPORTED */
7276 
7277  #if KMP_USE_ADAPTIVE_LOCKS
7278  #if KMP_DEBUG_ADAPTIVE_LOCKS
7279  __kmp_print_speculative_stats();
7280  #endif
7281  #endif
7282  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7283  __kmp_nested_nth.nth = NULL;
7284  __kmp_nested_nth.size = 0;
7285  __kmp_nested_nth.used = 0;
7286 
7287  __kmp_i18n_catclose();
7288 
7289 #if KMP_STATS_ENABLED
7290  __kmp_accumulate_stats_at_exit();
7291  __kmp_stats_list.deallocate();
7292 #endif
7293 
7294  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7295 }
7296 
7297 /* ------------------------------------------------------------------------ */
7298 /* ------------------------------------------------------------------------ */
7299 
7300 int
7301 __kmp_ignore_mppbeg( void )
7302 {
7303  char *env;
7304 
7305  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7306  if (__kmp_str_match_false( env ))
7307  return FALSE;
7308  }
7309  // By default __kmpc_begin() is a no-op.
7310  return TRUE;
7311 }
7312 
7313 int
7314 __kmp_ignore_mppend( void )
7315 {
7316  char *env;
7317 
7318  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7319  if (__kmp_str_match_false( env ))
7320  return FALSE;
7321  }
7322  // By default __kmpc_end() is a no-op.
7323  return TRUE;
7324 }
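// Editorial note: both checks above are driven by environment variables. For
// example, setting KMP_IGNORE_MPPEND=false (or 0) is expected to make __kmpc_end()
// perform a real shutdown instead of being a no-op, and KMP_IGNORE_MPPBEG behaves
// the same way for __kmpc_begin().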
7325 
7326 void
7327 __kmp_internal_begin( void )
7328 {
7329  int gtid;
7330  kmp_root_t *root;
7331 
7332  /* this is a very important step as it will register new sibling threads
7333  * and assign these new uber threads a new gtid */
7334  gtid = __kmp_entry_gtid();
7335  root = __kmp_threads[ gtid ]->th.th_root;
7336  KMP_ASSERT( KMP_UBER_GTID( gtid ));
7337 
7338  if( root->r.r_begin ) return;
7339  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7340  if( root->r.r_begin ) {
7341  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7342  return;
7343  }
7344 
7345  root->r.r_begin = TRUE;
7346 
7347  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7348 }
7349 
7350 
7351 /* ------------------------------------------------------------------------ */
7352 /* ------------------------------------------------------------------------ */
7353 
7354 void
7355 __kmp_user_set_library (enum library_type arg)
7356 {
7357  int gtid;
7358  kmp_root_t *root;
7359  kmp_info_t *thread;
7360 
7361  /* first, make sure we are initialized so we can get our gtid */
7362 
7363  gtid = __kmp_entry_gtid();
7364  thread = __kmp_threads[ gtid ];
7365 
7366  root = thread->th.th_root;
7367 
7368  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7369  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7370  KMP_WARNING( SetLibraryIncorrectCall );
7371  return;
7372  }
7373 
7374  switch ( arg ) {
7375  case library_serial :
7376  thread->th.th_set_nproc = 0;
7377  set__nproc( thread, 1 );
7378  break;
7379  case library_turnaround :
7380  thread->th.th_set_nproc = 0;
7381  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7382  break;
7383  case library_throughput :
7384  thread->th.th_set_nproc = 0;
7385  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7386  break;
7387  default:
7388  KMP_FATAL( UnknownLibraryType, arg );
7389  }
7390 
7391  __kmp_aux_set_library ( arg );
7392 }
7393 
7394 void
7395 __kmp_aux_set_stacksize( size_t arg )
7396 {
7397  if (! __kmp_init_serial)
7398  __kmp_serial_initialize();
7399 
7400 #if KMP_OS_DARWIN
7401  if (arg & (0x1000 - 1)) {
7402  arg &= ~(0x1000 - 1);
7403  if(arg + 0x1000) /* check for overflow if we round up */
7404  arg += 0x1000;
7405  }
7406 #endif
7407  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7408 
7409  /* only change the default stacksize before the first parallel region */
7410  if (! TCR_4(__kmp_init_parallel)) {
7411  size_t value = arg; /* argument is in bytes */
7412 
7413  if (value < __kmp_sys_min_stksize )
7414  value = __kmp_sys_min_stksize ;
7415  else if (value > KMP_MAX_STKSIZE)
7416  value = KMP_MAX_STKSIZE;
7417 
7418  __kmp_stksize = value;
7419 
7420  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7421  }
7422 
7423  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7424 }
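// Editorial worked example (illustrative): on OS X* the requested size is rounded
// up to a 4 KB (0x1000) page boundary, so __kmp_aux_set_stacksize(5000) first
// masks 5000 down to 4096 and then adds 0x1000, giving 8192 bytes; the result is
// afterwards clamped to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE] and only takes
// effect before the first parallel region.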
7425 
7426 /* set the behaviour of the runtime library */
7427 /* TODO this can cause some odd behaviour with sibling parallelism... */
7428 void
7429 __kmp_aux_set_library (enum library_type arg)
7430 {
7431  __kmp_library = arg;
7432 
7433  switch ( __kmp_library ) {
7434  case library_serial :
7435  {
7436  KMP_INFORM( LibraryIsSerial );
7437  (void) __kmp_change_library( TRUE );
7438  }
7439  break;
7440  case library_turnaround :
7441  (void) __kmp_change_library( TRUE );
7442  break;
7443  case library_throughput :
7444  (void) __kmp_change_library( FALSE );
7445  break;
7446  default:
7447  KMP_FATAL( UnknownLibraryType, arg );
7448  }
7449 }
7450 
7451 /* ------------------------------------------------------------------------ */
7452 /* ------------------------------------------------------------------------ */
7453 
7454 void
7455 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7456 {
7457  int blocktime = arg; /* argument is in milliseconds */
7458  int bt_intervals;
7459  int bt_set;
7460 
7461  __kmp_save_internal_controls( thread );
7462 
7463  /* Normalize and set blocktime for the teams */
7464  if (blocktime < KMP_MIN_BLOCKTIME)
7465  blocktime = KMP_MIN_BLOCKTIME;
7466  else if (blocktime > KMP_MAX_BLOCKTIME)
7467  blocktime = KMP_MAX_BLOCKTIME;
7468 
7469  set__blocktime_team( thread->th.th_team, tid, blocktime );
7470  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7471 
7472  /* Calculate and set blocktime intervals for the teams */
7473  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7474 
7475  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7476  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7477 
7478  /* Record that the blocktime has been explicitly set */
7479  bt_set = TRUE;
7480 
7481  set__bt_set_team( thread->th.th_team, tid, bt_set );
7482  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7483  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
7484  __kmp_gtid_from_tid(tid, thread->th.th_team),
7485  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
7486 }
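// Editorial note: this is the backing routine for the kmp_set_blocktime() entry
// point (see the trace above). The argument is in milliseconds and is clamped to
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]; bt_intervals re-expresses the same value
// in monitor wakeup intervals via KMP_INTERVALS_FROM_BLOCKTIME (its exact rounding
// is defined elsewhere in the runtime headers), and both values are stored per
// team for this thread.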
7487 
7488 void
7489 __kmp_aux_set_defaults(
7490  char const * str,
7491  int len
7492 ) {
7493  if ( ! __kmp_init_serial ) {
7494  __kmp_serial_initialize();
7495  };
7496  __kmp_env_initialize( str );
7497 
7498  if (__kmp_settings
7499 #if OMP_40_ENABLED
7500  || __kmp_display_env || __kmp_display_env_verbose
7501 #endif // OMP_40_ENABLED
7502  ) {
7503  __kmp_env_print();
7504  }
7505 } // __kmp_aux_set_defaults
7506 
7507 /* ------------------------------------------------------------------------ */
7508 
7509 /*
7510  * internal fast reduction routines
7511  */
7512 
7513 PACKED_REDUCTION_METHOD_T
7514 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7515  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7516  kmp_critical_name *lck )
7517 {
7518 
7519  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7520  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7521  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7522  // Finally, it is up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7523 
7524  PACKED_REDUCTION_METHOD_T retval;
7525 
7526  int team_size;
7527 
7528  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
7529  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
7530 
7531  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7532  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
7533 
7534  retval = critical_reduce_block;
7535 
7536  team_size = __kmp_get_team_num_threads( global_tid ); // another choice of getting a team size ( with 1 dynamic dereference ) is slower
7537 
7538  if( team_size == 1 ) {
7539 
7540  retval = empty_reduce_block;
7541 
7542  } else {
7543 
7544  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7545  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7546 
7547  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7548 
7549  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7550 
7551  int teamsize_cutoff = 4;
7552 
7553 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7554  if( __kmp_mic_type != non_mic ) {
7555  teamsize_cutoff = 8;
7556  }
7557 #endif
7558  if( tree_available ) {
7559  if( team_size <= teamsize_cutoff ) {
7560  if ( atomic_available ) {
7561  retval = atomic_reduce_block;
7562  }
7563  } else {
7564  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7565  }
7566  } else if ( atomic_available ) {
7567  retval = atomic_reduce_block;
7568  }
7569  #else
7570  #error "Unknown or unsupported OS"
7571  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7572 
7573  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7574 
7575  #if KMP_OS_LINUX || KMP_OS_WINDOWS
7576 
7577  // basic tuning
7578 
7579  if( atomic_available ) {
7580  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7581  retval = atomic_reduce_block;
7582  }
7583  } // otherwise: use critical section
7584 
7585  #elif KMP_OS_DARWIN
7586 
7587  if( atomic_available && ( num_vars <= 3 ) ) {
7588  retval = atomic_reduce_block;
7589  } else if( tree_available ) {
7590  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7591  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7592  }
7593  } // otherwise: use critical section
7594 
7595  #else
7596  #error "Unknown or unsupported OS"
7597  #endif
7598 
7599  #else
7600  #error "Unknown or unsupported architecture"
7601  #endif
7602 
7603  }
7604 
7605  // KMP_FORCE_REDUCTION
7606 
7607  // If the team is serialized (team_size == 1), ignore the forced reduction
7608  // method and stay with the unsynchronized method (empty_reduce_block)
7609  if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7610 
7611  PACKED_REDUCTION_METHOD_T forced_retval;
7612 
7613  int atomic_available, tree_available;
7614 
7615  switch( ( forced_retval = __kmp_force_reduction_method ) )
7616  {
7617  case critical_reduce_block:
7618  KMP_ASSERT( lck ); // lck should be != 0
7619  break;
7620 
7621  case atomic_reduce_block:
7622  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7623  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
7624  break;
7625 
7626  case tree_reduce_block:
7627  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7628  KMP_ASSERT( tree_available ); // tree_available should be != 0
7629  #if KMP_FAST_REDUCTION_BARRIER
7630  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7631  #endif
7632  break;
7633 
7634  default:
7635  KMP_ASSERT( 0 ); // "unsupported method specified"
7636  }
7637 
7638  retval = forced_retval;
7639  }
7640 
7641  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7642 
7643  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7644  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7645 
7646  return ( retval );
7647 }
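// Editorial worked example (derived from the logic above): on an x86_64 Linux*
// host that is not a MIC, teamsize_cutoff is 4, so with both the tree and atomic
// variants generated a team of 16 threads selects
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER and a team of 4 selects
// atomic_reduce_block. A serialized team (team_size == 1) always uses
// empty_reduce_block; for larger teams __kmp_force_reduction_method (the
// KMP_FORCE_REDUCTION setting) can override the choice.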
7648 
7649 // this function is for testing set/get/determine reduce method
7650 kmp_int32
7651 __kmp_get_reduce_method( void ) {
7652  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7653 }
7654 
7655 /* ------------------------------------------------------------------------ */