LLVM OpenMP* Runtime Library
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 
29 #if OMPT_SUPPORT
30 #include "ompt-specific.h"
31 #endif
32 
33 /* these are temporary issues to be dealt with */
34 #define KMP_USE_PRCTL 0
35 
36 #if KMP_OS_WINDOWS
37 #include <process.h>
38 #endif
39 
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
43 #endif /* defined(KMP_GOMP_COMPAT) */
44 
45 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
46 #if OMP_40_ENABLED
47  "4.0 (201307)";
48 #else
49  "3.1 (201107)";
50 #endif
51 
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
54 #endif /* KMP_DEBUG */
55 
56 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
57 
58 /* ------------------------------------------------------------------------ */
59 /* ------------------------------------------------------------------------ */
60 
61 kmp_info_t __kmp_monitor;
62 
63 /* ------------------------------------------------------------------------ */
64 /* ------------------------------------------------------------------------ */
65 
66 /* Forward declarations */
67 
68 void __kmp_cleanup( void );
69 
70 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
71 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
72 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
73 static void __kmp_partition_places( kmp_team_t *team, int update_master_only=0 );
74 #endif
75 static void __kmp_do_serial_initialize( void );
76 void __kmp_fork_barrier( int gtid, int tid );
77 void __kmp_join_barrier( int gtid );
78 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
79 
80 #ifdef USE_LOAD_BALANCE
81 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
82 #endif
83 
84 static int __kmp_expand_threads(int nWish, int nNeed);
85 #if KMP_OS_WINDOWS
86 static int __kmp_unregister_root_other_thread( int gtid );
87 #endif
88 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
89 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
90 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
91 
92 /* ------------------------------------------------------------------------ */
93 /* ------------------------------------------------------------------------ */
94 
95 /* Calculate the identifier of the current thread */
96 /* A fast (and somewhat portable) way to get a unique */
97 /* identifier for the executing thread. */
98 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
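/* Lookup strategy, in order of preference:
   1. __kmp_gtid_mode >= 3: read the gtid from native thread-local storage (__kmp_gtid).
   2. __kmp_gtid_mode >= 2: read the gtid from keyed TLS (__kmp_gtid_get_specific()).
   3. Otherwise: scan __kmp_threads[] and match the address of a local variable
      against each registered thread's stack range (the stack grows down). */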
99 
100 int
101 __kmp_get_global_thread_id( )
102 {
103  int i;
104  kmp_info_t **other_threads;
105  size_t stack_data;
106  char *stack_addr;
107  size_t stack_size;
108  char *stack_base;
109 
110  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
111  __kmp_nth, __kmp_all_nth ));
112 
113  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
114  parallel region, this was made to return KMP_GTID_DNE to force serial_initialize by the
115  caller. We had to handle KMP_GTID_DNE at all call sites, or else guarantee
116  __kmp_init_gtid for this to work. */
117 
118  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
119 
120 #ifdef KMP_TDATA_GTID
121  if ( TCR_4(__kmp_gtid_mode) >= 3) {
122  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
123  return __kmp_gtid;
124  }
125 #endif
126  if ( TCR_4(__kmp_gtid_mode) >= 2) {
127  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
128  return __kmp_gtid_get_specific();
129  }
130  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
131 
132  stack_addr = (char*) & stack_data;
133  other_threads = __kmp_threads;
134 
135  /*
136  ATT: The code below is a source of potential bugs due to unsynchronized access to
137  __kmp_threads array. For example:
138  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
139  2. Current thread is suspended by OS.
140  3. Another thread unregisters and finishes (debug versions of free() may fill memory
141  with something like 0xEF).
142  4. Current thread is resumed.
143  5. Current thread reads junk from *thr.
144  TODO: Fix it.
145  --ln
146  */
147 
148  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
149 
150  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
151  if( !thr ) continue;
152 
153  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
154  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
155 
156  /* stack grows down -- search through all of the active threads */
157 
158  if( stack_addr <= stack_base ) {
159  size_t stack_diff = stack_base - stack_addr;
160 
161  if( stack_diff <= stack_size ) {
162  /* The only way we can be closer than the allocated */
163  /* stack size is if we are running on this thread. */
164  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
165  return i;
166  }
167  }
168  }
169 
170  /* use keyed TLS (get_specific) to try to determine our gtid */
171  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
172  "thread, using TLS\n" ));
173  i = __kmp_gtid_get_specific();
174 
175  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
176 
177  /* if we haven't been assigned a gtid, then return that (negative) code */
178  if( i<0 ) return i;
179 
180  /* dynamically updated stack window for uber threads to avoid get_specific call */
181  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
182  KMP_FATAL( StackOverflow, i );
183  }
184 
185  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
186  if( stack_addr > stack_base ) {
187  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
188  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
189  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
190  } else {
191  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
192  }
193 
194  /* Reprint stack bounds for ubermaster since they have been refined */
195  if ( __kmp_storage_map ) {
196  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
197  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
198  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
199  other_threads[i]->th.th_info.ds.ds_stacksize,
200  "th_%d stack (refinement)", i );
201  }
202  return i;
203 }
204 
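/* Registering variant of the lookup above: if the calling thread has not been
   assigned a gtid yet, it is registered as a new root (after serial
   initialization if necessary), so this function never returns KMP_GTID_DNE. */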
205 int
206 __kmp_get_global_thread_id_reg( )
207 {
208  int gtid;
209 
210  if ( !__kmp_init_serial ) {
211  gtid = KMP_GTID_DNE;
212  } else
213 #ifdef KMP_TDATA_GTID
214  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
215  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
216  gtid = __kmp_gtid;
217  } else
218 #endif
219  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
220  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
221  gtid = __kmp_gtid_get_specific();
222  } else {
223  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
224  gtid = __kmp_get_global_thread_id();
225  }
226 
227  /* we must be a new uber master sibling thread */
228  if( gtid == KMP_GTID_DNE ) {
229  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
230  "Registering a new gtid.\n" ));
231  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
232  if( !__kmp_init_serial ) {
233  __kmp_do_serial_initialize();
234  gtid = __kmp_gtid_get_specific();
235  } else {
236  gtid = __kmp_register_root(FALSE);
237  }
238  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
239  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
240  }
241 
242  KMP_DEBUG_ASSERT( gtid >=0 );
243 
244  return gtid;
245 }
246 
247 /* caller must hold forkjoin_lock */
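/* When extensive checks are enabled (__kmp_env_checks) and 'th' is not an uber/root
   thread, its registered stack range is compared against every other registered
   thread's range; any intersection is reported as a fatal StackOverlap error with
   a ChangeStackLimit hint. */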
248 void
249 __kmp_check_stack_overlap( kmp_info_t *th )
250 {
251  int f;
252  char *stack_beg = NULL;
253  char *stack_end = NULL;
254  int gtid;
255 
256  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
257  if ( __kmp_storage_map ) {
258  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
259  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
260 
261  gtid = __kmp_gtid_from_thread( th );
262 
263  if (gtid == KMP_GTID_MONITOR) {
264  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
265  "th_%s stack (%s)", "mon",
266  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
267  } else {
268  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
269  "th_%d stack (%s)", gtid,
270  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
271  }
272  }
273 
274  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
275  gtid = __kmp_gtid_from_thread( th );
276  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
277  {
278  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
279  if ( stack_beg == NULL ) {
280  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
281  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
282  }
283 
284  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
285  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
286 
287  if( f_th && f_th != th ) {
288  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
289  char *other_stack_beg = other_stack_end -
290  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
291  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
292  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
293 
294  /* Print the other stack values before the abort */
295  if ( __kmp_storage_map )
296  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
297  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
298  "th_%d stack (overlapped)",
299  __kmp_gtid_from_thread( f_th ) );
300 
301  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
302  }
303  }
304  }
305  }
306  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
307 }
308 
309 
310 /* ------------------------------------------------------------------------ */
311 
312 /* ------------------------------------------------------------------------ */
313 
314 void
315 __kmp_infinite_loop( void )
316 {
317  static int done = FALSE;
318 
319  while (! done) {
320  KMP_YIELD( 1 );
321  }
322 }
323 
324 #define MAX_MESSAGE 512
325 
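/* Splices the caller's format string into a fixed prefix ("OMP storage map: <p1> <p2>
   <size> ...") and prints the result to kmp_err under __kmp_stdio_lock. When
   KMP_PRINT_DATA_PLACEMENT is enabled, it additionally reports which memory node
   backs the page range [p1, p2]. */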
326 void
327 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
328  char buffer[MAX_MESSAGE];
329  va_list ap;
330 
331  va_start( ap, format);
332  KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
333  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
334  __kmp_vprintf( kmp_err, buffer, ap );
335 #if KMP_PRINT_DATA_PLACEMENT
336  int node;
337  if(gtid >= 0) {
338  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
339  if( __kmp_storage_map_verbose ) {
340  node = __kmp_get_host_node(p1);
341  if(node < 0) /* doesn't work, so don't try this next time */
342  __kmp_storage_map_verbose = FALSE;
343  else {
344  char *last;
345  int lastNode;
346  int localProc = __kmp_get_cpu_from_gtid(gtid);
347 
348  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
349  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
350  if(localProc >= 0)
351  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
352  else
353  __kmp_printf_no_lock(" GTID %d\n", gtid);
354 # if KMP_USE_PRCTL
355 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
356  do {
357  last = p1;
358  lastNode = node;
359  /* This loop collates adjacent pages with the same host node. */
360  do {
361  (char*)p1 += PAGE_SIZE;
362  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
363  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
364  (char*)p1 - 1, lastNode);
365  } while(p1 <= p2);
366 # else
367  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
368  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
369  if(p1 < p2) {
370  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
371  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
372  }
373 # endif
374  }
375  }
376  } else
377  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
378  }
379 #endif /* KMP_PRINT_DATA_PLACEMENT */
380  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
381 }
382 
383 void
384 __kmp_warn( char const * format, ... )
385 {
386  char buffer[MAX_MESSAGE];
387  va_list ap;
388 
389  if ( __kmp_generate_warnings == kmp_warnings_off ) {
390  return;
391  }
392 
393  va_start( ap, format );
394 
395  KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
396  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
397  __kmp_vprintf( kmp_err, buffer, ap );
398  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
399 
400  va_end( ap );
401 }
402 
403 void
404 __kmp_abort_process()
405 {
406 
407  // Later threads may stall here, but that's ok because abort() will kill them.
408  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
409 
410  if ( __kmp_debug_buf ) {
411  __kmp_dump_debug_buffer();
412  }; // if
413 
414  if ( KMP_OS_WINDOWS ) {
415  // Let other threads know of abnormal termination and prevent deadlock
416  // if abort happened during library initialization or shutdown
417  __kmp_global.g.g_abort = SIGABRT;
418 
419  /*
420  On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly testing.
421  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
422  works well, but this function is not available in VS7 (this is not a problem for the DLL, but
423  it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
424  not help, at least in some versions of the MS C RTL.
425 
426  The following sequence seems to be the only way to simulate abort() and avoid the pop-up
427  error box.
428  */
429  raise( SIGABRT );
430  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
431  } else {
432  abort();
433  }; // if
434 
435  __kmp_infinite_loop();
436  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
437 
438 } // __kmp_abort_process
439 
440 void
441 __kmp_abort_thread( void )
442 {
443  // TODO: Eliminate g_abort global variable and this function.
444  // In case of abort, just call abort(); it will kill all the threads.
445  __kmp_infinite_loop();
446 } // __kmp_abort_thread
447 
448 /* ------------------------------------------------------------------------ */
449 
450 /*
451  * Print out the storage map for the major kmp_info_t thread data structures
452  * that are allocated together.
453  */
454 
455 static void
456 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
457 {
458  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
459 
460  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
461  "th_%d.th_info", gtid );
462 
463  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
464  "th_%d.th_local", gtid );
465 
466  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
468 
469  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
470  &thr->th.th_bar[bs_plain_barrier+1],
471  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
472 
473  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
474  &thr->th.th_bar[bs_forkjoin_barrier+1],
475  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
476 
477  #if KMP_FAST_REDUCTION_BARRIER
478  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
479  &thr->th.th_bar[bs_reduction_barrier+1],
480  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
481  #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /*
485  * Print out the storage map for the major kmp_team_t team data structures
486  * that are allocated together.
487  */
488 
489 static void
490 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
491 {
492  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494  header, team_id );
495 
496  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
497  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
498 
499 
500  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
501  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
502 
503  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
504  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
505 
506  #if KMP_FAST_REDUCTION_BARRIER
507  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
508  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
509  #endif // KMP_FAST_REDUCTION_BARRIER
510 
511  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
512  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
513 
514  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
515  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
516 
517  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
518  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
519  header, team_id );
520 
521 
522  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
523  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
524 }
525 
526 static void __kmp_init_allocator() {}
527 static void __kmp_fini_allocator() {}
528 
529 /* ------------------------------------------------------------------------ */
530 
531 #ifdef KMP_DYNAMIC_LIB
532 # if KMP_OS_WINDOWS
533 
534 static void
535 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
536  // TODO: Change to __kmp_break_bootstrap_lock().
537  __kmp_init_bootstrap_lock( lck ); // make the lock released
538 }
539 
540 static void
541 __kmp_reset_locks_on_process_detach( int gtid_req ) {
542  int i;
543  int thread_count;
544 
545  // PROCESS_DETACH is expected to be called by a thread
546  // that executes ProcessExit() or FreeLibrary().
547  // The OS terminates all other threads (except the one calling ProcessExit or FreeLibrary),
548  // so it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
549  // In fact, however, some threads may still be alive here, although they are about to be terminated.
550  // The entries in the array with ds_thread==0 are the most suspicious,
551  // so accessing __kmp_threads[] may not actually be safe.
552 
553  // TODO: does it make sense to check __kmp_roots[] ?
554 
555  // Let's check that there are no other live threads registered with the OpenMP library.
556  while( 1 ) {
557  thread_count = 0;
558  for( i = 0; i < __kmp_threads_capacity; ++i ) {
559  if( !__kmp_threads ) continue;
560  kmp_info_t* th = __kmp_threads[ i ];
561  if( th == NULL ) continue;
562  int gtid = th->th.th_info.ds.ds_gtid;
563  if( gtid == gtid_req ) continue;
564  if( gtid < 0 ) continue;
565  DWORD exit_val;
566  int alive = __kmp_is_thread_alive( th, &exit_val );
567  if( alive ) {
568  ++thread_count;
569  }
570  }
571  if( thread_count == 0 ) break; // success
572  }
573 
574  // Assume that I'm alone.
575 
576  // Now it is probably safe to check and reset the locks.
577  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
578  __kmp_reset_lock( &__kmp_forkjoin_lock );
579  #ifdef KMP_DEBUG
580  __kmp_reset_lock( &__kmp_stdio_lock );
581  #endif // KMP_DEBUG
582 }
583 
584 BOOL WINAPI
585 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
586  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
587 
588  switch( fdwReason ) {
589 
590  case DLL_PROCESS_ATTACH:
591  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
592 
593  return TRUE;
594 
595  case DLL_PROCESS_DETACH:
596  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
597  __kmp_gtid_get_specific() ));
598 
599  if( lpReserved != NULL )
600  {
601  // lpReserved is used for telling the difference:
602  // lpReserved == NULL when FreeLibrary() was called,
603  // lpReserved != NULL when the process terminates.
604  // When FreeLibrary() is called, worker threads remain alive.
605  // So they will release the forkjoin lock by themselves.
606  // When the process terminates, worker threads disappear triggering
607  // When the process terminates, worker threads disappear, triggering
608 
609  // A worker thread can take the forkjoin lock.
610  // The problem comes up if that worker thread dies
611  // before it releases the forkjoin lock.
612  // The forkjoin lock remains taken, while the thread
613  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
614  // will try to take the forkjoin lock and will always fail,
615  // so that the application will never finish [normally].
616  // This scenario is possible if __kmpc_end() has not been executed.
617  // These are not corner cases, but common situations:
618  // - the main function was compiled by an alternative compiler;
619  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
620  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
621  // - a live foreign thread prevented __kmpc_end from doing cleanup.
622 
623  // This is a hack to work around the problem.
624  // TODO: !!! figure out something better.
625  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
626  }
627 
628  __kmp_internal_end_library( __kmp_gtid_get_specific() );
629 
630  return TRUE;
631 
632  case DLL_THREAD_ATTACH:
633  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
634 
635  /* if we wanted to register new siblings all the time here call
636  * __kmp_get_gtid(); */
637  return TRUE;
638 
639  case DLL_THREAD_DETACH:
640  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
641  __kmp_gtid_get_specific() ));
642 
643  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
644  return TRUE;
645  }
646 
647  return TRUE;
648 }
649 
650 # endif /* KMP_OS_WINDOWS */
651 #endif /* KMP_DYNAMIC_LIB */
652 
653 
654 /* ------------------------------------------------------------------------ */
655 
656 /* Change the library type to "status" and return the old type */
657 /* called from within initialization routines where __kmp_initz_lock is held */
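/* The current mode is encoded in the low bit of __kmp_yield_init: an even count means
   throughput, an odd count means turnaround (see the inline comments below). A nonzero
   'status' selects turnaround; the previous low bit is returned. */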
658 int
659 __kmp_change_library( int status )
660 {
661  int old_status;
662 
663  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
664 
665  if (status) {
666  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
667  }
668  else {
669  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
670  }
671 
672  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
673 }
674 
675 /* ------------------------------------------------------------------------ */
676 /* ------------------------------------------------------------------------ */
677 
678 /* __kmp_parallel_deo --
679  * Wait until it's our turn.
680  */
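/* Under BUILD_PARALLEL_ORDERED the ordered sections are serialized with a simple
   ticket scheme: each thread waits (KMP_WAIT_YIELD) until the team counter
   team->t.t_ordered.dt.t_value equals its own tid, and __kmp_parallel_dxo below
   releases the next thread by advancing the counter to (tid + 1) % t_nproc. */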
681 void
682 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
683 {
684  int gtid = *gtid_ref;
685 #ifdef BUILD_PARALLEL_ORDERED
686  kmp_team_t *team = __kmp_team_from_gtid( gtid );
687 #endif /* BUILD_PARALLEL_ORDERED */
688 
689  if( __kmp_env_consistency_check ) {
690  if( __kmp_threads[gtid]->th.th_root->r.r_active )
691 #if KMP_USE_DYNAMIC_LOCK
692  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
693 #else
694  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
695 #endif
696  }
697 #ifdef BUILD_PARALLEL_ORDERED
698  if( !team->t.t_serialized ) {
699  KMP_MB();
700  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
701  KMP_MB();
702  }
703 #endif /* BUILD_PARALLEL_ORDERED */
704 }
705 
706 /* __kmp_parallel_dxo --
707  * Signal the next task.
708  */
709 
710 void
711 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
712 {
713  int gtid = *gtid_ref;
714 #ifdef BUILD_PARALLEL_ORDERED
715  int tid = __kmp_tid_from_gtid( gtid );
716  kmp_team_t *team = __kmp_team_from_gtid( gtid );
717 #endif /* BUILD_PARALLEL_ORDERED */
718 
719  if( __kmp_env_consistency_check ) {
720  if( __kmp_threads[gtid]->th.th_root->r.r_active )
721  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
722  }
723 #ifdef BUILD_PARALLEL_ORDERED
724  if ( ! team->t.t_serialized ) {
725  KMP_MB(); /* Flush all pending memory write invalidates. */
726 
727  /* use the tid of the next thread in this team */
728  /* TODO: replace with a general release procedure */
729  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
730 
731 #if OMPT_SUPPORT && OMPT_BLAME
732  if (ompt_enabled &&
733  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
734  /* accept blame for "ordered" waiting */
735  kmp_info_t *this_thread = __kmp_threads[gtid];
736  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
737  this_thread->th.ompt_thread_info.wait_id);
738  }
739 #endif
740 
741  KMP_MB(); /* Flush all pending memory write invalidates. */
742  }
743 #endif /* BUILD_PARALLEL_ORDERED */
744 }
745 
746 /* ------------------------------------------------------------------------ */
747 /* ------------------------------------------------------------------------ */
748 
749 /* ------------------------------------------------------------------------ */
750 /* ------------------------------------------------------------------------ */
751 
752 /* The BARRIER for a SINGLE process section is always explicit */
753 
754 int
755 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
756 {
757  int status;
758  kmp_info_t *th;
759  kmp_team_t *team;
760 
761  if( ! TCR_4(__kmp_init_parallel) )
762  __kmp_parallel_initialize();
763 
764  th = __kmp_threads[ gtid ];
765  team = th->th.th_team;
766  status = 0;
767 
768  th->th.th_ident = id_ref;
769 
770  if ( team->t.t_serialized ) {
771  status = 1;
772  } else {
773  kmp_int32 old_this = th->th.th_local.this_construct;
774 
775  ++th->th.th_local.this_construct;
776  /* try to set team count to thread count--success means thread got the
777  single block
778  */
779  /* TODO: Should this be acquire or release? */
780  if (team->t.t_construct == old_this) {
781  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
782  th->th.th_local.this_construct);
783  }
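/* th_local.this_construct counts the single constructs this thread has encountered,
   while team->t.t_construct holds the count of constructs already claimed in the team.
   The compare-and-swap above succeeds for exactly one thread per construct, and that
   thread (status != 0) executes the single block. */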
784 #if USE_ITT_BUILD
785  if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
786 #if OMP_40_ENABLED
787  th->th.th_teams_microtask == NULL &&
788 #endif
789  team->t.t_active_level == 1 )
790  { // Only report metadata by master of active team at level 1
791  __kmp_itt_metadata_single( id_ref );
792  }
793 #endif /* USE_ITT_BUILD */
794  }
795 
796  if( __kmp_env_consistency_check ) {
797  if (status && push_ws) {
798  __kmp_push_workshare( gtid, ct_psingle, id_ref );
799  } else {
800  __kmp_check_workshare( gtid, ct_psingle, id_ref );
801  }
802  }
803 #if USE_ITT_BUILD
804  if ( status ) {
805  __kmp_itt_single_start( gtid );
806  }
807 #endif /* USE_ITT_BUILD */
808  return status;
809 }
810 
811 void
812 __kmp_exit_single( int gtid )
813 {
814 #if USE_ITT_BUILD
815  __kmp_itt_single_end( gtid );
816 #endif /* USE_ITT_BUILD */
817  if( __kmp_env_consistency_check )
818  __kmp_pop_workshare( gtid, ct_psingle, NULL );
819 }
820 
821 
822 /*
823  * Determine whether we can go parallel or must use a serialized parallel region, and
824  * how many threads we can use.
825  * set_nthreads is the number of threads requested for the team.
826  * Returns 1 if we should serialize or only use one thread,
827  * otherwise the number of threads to use.
828  * The forkjoin lock is held by the caller.
829  */
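/* The requested thread count is adjusted in up to three stages:
   1. if dyn-var is set, reduce it according to the active dynamic mode
      (load balance, thread limit, or random);
   2. clamp it so that the process-wide thread total stays within __kmp_max_nth
      (KMP_ALL_THREADS / KMP_MAX_THREADS / OMP_THREAD_LIMIT);
   3. reduce it further if the __kmp_threads[] array cannot be expanded enough.
   A result of 1 tells the caller to serialize the region. */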
830 static int
831 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
832  int master_tid, int set_nthreads
833 #if OMP_40_ENABLED
834  , int enter_teams
835 #endif /* OMP_40_ENABLED */
836 )
837 {
838  int capacity;
839  int new_nthreads;
840  KMP_DEBUG_ASSERT( __kmp_init_serial );
841  KMP_DEBUG_ASSERT( root && parent_team );
842 
843  //
844  // If dyn-var is set, dynamically adjust the number of desired threads,
845  // according to the method specified by dynamic_mode.
846  //
847  new_nthreads = set_nthreads;
848  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
849  ;
850  }
851 #ifdef USE_LOAD_BALANCE
852  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
853  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
854  if ( new_nthreads == 1 ) {
855  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
856  master_tid ));
857  return 1;
858  }
859  if ( new_nthreads < set_nthreads ) {
860  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
861  master_tid, new_nthreads ));
862  }
863  }
864 #endif /* USE_LOAD_BALANCE */
865  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
866  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
867  : root->r.r_hot_team->t.t_nproc);
868  if ( new_nthreads <= 1 ) {
869  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
870  master_tid ));
871  return 1;
872  }
873  if ( new_nthreads < set_nthreads ) {
874  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
875  master_tid, new_nthreads ));
876  }
877  else {
878  new_nthreads = set_nthreads;
879  }
880  }
881  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
882  if ( set_nthreads > 2 ) {
883  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
884  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
885  if ( new_nthreads == 1 ) {
886  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
887  master_tid ));
888  return 1;
889  }
890  if ( new_nthreads < set_nthreads ) {
891  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
892  master_tid, new_nthreads ));
893  }
894  }
895  }
896  else {
897  KMP_ASSERT( 0 );
898  }
899 
900  //
901  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
902  //
903  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
904  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
905  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
906  root->r.r_hot_team->t.t_nproc );
907  if ( tl_nthreads <= 0 ) {
908  tl_nthreads = 1;
909  }
910 
911  //
912  // If dyn-var is false, emit a 1-time warning.
913  //
914  if ( ! get__dynamic_2( parent_team, master_tid )
915  && ( ! __kmp_reserve_warn ) ) {
916  __kmp_reserve_warn = 1;
917  __kmp_msg(
918  kmp_ms_warning,
919  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
920  KMP_HNT( Unset_ALL_THREADS ),
921  __kmp_msg_null
922  );
923  }
924  if ( tl_nthreads == 1 ) {
925  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
926  master_tid ));
927  return 1;
928  }
929  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
930  master_tid, tl_nthreads ));
931  new_nthreads = tl_nthreads;
932  }
933 
934  //
935  // Check if the threads array is large enough, or needs expanding.
936  //
937  // See comment in __kmp_register_root() about the adjustment if
938  // __kmp_threads[0] == NULL.
939  //
940  capacity = __kmp_threads_capacity;
941  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
942  --capacity;
943  }
944  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
945  root->r.r_hot_team->t.t_nproc ) > capacity ) {
946  //
947  // Expand the threads array.
948  //
949  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
950  root->r.r_hot_team->t.t_nproc ) - capacity;
951  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
952  if ( slotsAdded < slotsRequired ) {
953  //
954  // The threads array was not expanded enough.
955  //
956  new_nthreads -= ( slotsRequired - slotsAdded );
957  KMP_ASSERT( new_nthreads >= 1 );
958 
959  //
960  // If dyn-var is false, emit a 1-time warning.
961  //
962  if ( ! get__dynamic_2( parent_team, master_tid )
963  && ( ! __kmp_reserve_warn ) ) {
964  __kmp_reserve_warn = 1;
965  if ( __kmp_tp_cached ) {
966  __kmp_msg(
967  kmp_ms_warning,
968  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
969  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
970  KMP_HNT( PossibleSystemLimitOnThreads ),
971  __kmp_msg_null
972  );
973  }
974  else {
975  __kmp_msg(
976  kmp_ms_warning,
977  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
978  KMP_HNT( SystemLimitOnThreads ),
979  __kmp_msg_null
980  );
981  }
982  }
983  }
984  }
985 
986  if ( new_nthreads == 1 ) {
987  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
988  __kmp_get_gtid(), set_nthreads ) );
989  return 1;
990  }
991 
992  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
993  __kmp_get_gtid(), new_nthreads, set_nthreads ));
994  return new_nthreads;
995 }
996 
997 /* ------------------------------------------------------------------------ */
998 /* ------------------------------------------------------------------------ */
999 
1000 /* allocate threads from the thread pool and assign them to the new team */
1001 /* we are assured that there are enough threads available, because we
1002  * checked on that earlier within critical section forkjoin */
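/* If this team is the hot team for the current nesting level, its workers are already
   in place and only the master needs to be (re)wired below; otherwise the workers are
   obtained one by one via __kmp_allocate_thread() and their barrier arrived counters
   are aligned with the team's counters. */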
1003 
1004 static void
1005 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1006  kmp_info_t *master_th, int master_gtid )
1007 {
1008  int i;
1009  int use_hot_team;
1010 
1011  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1012  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1013  KMP_MB();
1014 
1015  /* first, let's setup the master thread */
1016  master_th->th.th_info.ds.ds_tid = 0;
1017  master_th->th.th_team = team;
1018  master_th->th.th_team_nproc = team->t.t_nproc;
1019  master_th->th.th_team_master = master_th;
1020  master_th->th.th_team_serialized = FALSE;
1021  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1022 
1023  /* make sure we are not the optimized hot team */
1024 #if KMP_NESTED_HOT_TEAMS
1025  use_hot_team = 0;
1026  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1027  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1028  int level = team->t.t_active_level - 1; // index in array of hot teams
1029  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1030  if( master_th->th.th_teams_size.nteams > 1 ) {
1031  ++level; // level was not increased in teams construct for team_of_masters
1032  }
1033  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1034  master_th->th.th_teams_level == team->t.t_level ) {
1035  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1036  } // team->t.t_level will be increased inside parallel
1037  }
1038  if( level < __kmp_hot_teams_max_level ) {
1039  if( hot_teams[level].hot_team ) {
1040  // hot team has already been allocated for given level
1041  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1042  use_hot_team = 1; // the team is ready to use
1043  } else {
1044  use_hot_team = 0; // AC: threads are not allocated yet
1045  hot_teams[level].hot_team = team; // remember new hot team
1046  hot_teams[level].hot_team_nth = team->t.t_nproc;
1047  }
1048  } else {
1049  use_hot_team = 0;
1050  }
1051  }
1052 #else
1053  use_hot_team = team == root->r.r_hot_team;
1054 #endif
1055  if ( !use_hot_team ) {
1056 
1057  /* install the master thread */
1058  team->t.t_threads[ 0 ] = master_th;
1059  __kmp_initialize_info( master_th, team, 0, master_gtid );
1060 
1061  /* now, install the worker threads */
1062  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1063 
1064  /* fork or reallocate a new thread and install it in team */
1065  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1066  team->t.t_threads[ i ] = thr;
1067  KMP_DEBUG_ASSERT( thr );
1068  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1069  /* align team and thread arrived states */
1070  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1071  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1072  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1073  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1074  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1075 #if OMP_40_ENABLED
1076  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1077  thr->th.th_teams_level = master_th->th.th_teams_level;
1078  thr->th.th_teams_size = master_th->th.th_teams_size;
1079 #endif
1080  { // Initialize threads' barrier data.
1081  int b;
1082  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1083  for ( b = 0; b < bs_last_barrier; ++ b ) {
1084  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1085  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1086 #if USE_DEBUGGER
1087  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1088 #endif
1089  }; // for b
1090  }
1091  }
1092 
1093 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1094  __kmp_partition_places( team );
1095 #endif
1096 
1097  }
1098 
1099  KMP_MB();
1100 }
1101 
1102 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1103 //
1104  // Propagate any changes to the floating point control registers out to the team.
1105 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1106 // so we don't make changes unless they are needed.
1107 //
1108 inline static void
1109 propagateFPControl(kmp_team_t * team)
1110 {
1111  if ( __kmp_inherit_fp_control ) {
1112  kmp_int16 x87_fpu_control_word;
1113  kmp_uint32 mxcsr;
1114 
1115  // Get master values of FPU control flags (both X87 and vector)
1116  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1117  __kmp_store_mxcsr( &mxcsr );
1118  mxcsr &= KMP_X86_MXCSR_MASK;
1119 
1120  // There is no point looking at t_fp_control_saved here.
1121  // If it is TRUE, we still have to update the values if they are different from those we now have.
1122  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1123  // that the values in the team are the same as those we have.
1124  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1125  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1126  // cache-line into a written state, causing all threads in the team to have to read it again.
1127  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1128  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1129  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1130  // So we must ensure it is correct.
1131  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1132  }
1133  else {
1134  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1135  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1136  }
1137 }
1138 
1139 // Do the opposite, setting the hardware registers to the updated values from the team.
1140 inline static void
1141 updateHWFPControl(kmp_team_t * team)
1142 {
1143  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1144  //
1145  // Only reset the fp control regs if they have been changed in the team
1146  // by the parallel region that we are exiting.
1147  //
1148  kmp_int16 x87_fpu_control_word;
1149  kmp_uint32 mxcsr;
1150  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1151  __kmp_store_mxcsr( &mxcsr );
1152  mxcsr &= KMP_X86_MXCSR_MASK;
1153 
1154  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1155  __kmp_clear_x87_fpu_status_word();
1156  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1157  }
1158 
1159  if ( team->t.t_mxcsr != mxcsr ) {
1160  __kmp_load_mxcsr( &team->t.t_mxcsr );
1161  }
1162  }
1163 }
1164 #else
1165 # define propagateFPControl(x) ((void)0)
1166 # define updateHWFPControl(x) ((void)0)
1167 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1168 
1169 static void
1170 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1171 
1172 /*
1173  * Run a parallel region that has been serialized, so it runs only in a team of the single master thread.
1174  */
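/* The serialized team held in th_serial_team is reused across nested serialized
   regions: the first serialized level installs (or allocates) the team and its
   dispatch buffer, while deeper levels just bump t_serialized / t_level and push
   another dispatch buffer, to be popped again when the serialized region ends. */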
1175 void
1176 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1177 {
1178  kmp_info_t *this_thr;
1179  kmp_team_t *serial_team;
1180 
1181  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1182 
1183  /* Skip all this code for autopar serialized loops since it results in
1184  unacceptable overhead */
1185  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1186  return;
1187 
1188  if( ! TCR_4( __kmp_init_parallel ) )
1189  __kmp_parallel_initialize();
1190 
1191  this_thr = __kmp_threads[ global_tid ];
1192  serial_team = this_thr->th.th_serial_team;
1193 
1194  /* utilize the serialized team held by this thread */
1195  KMP_DEBUG_ASSERT( serial_team );
1196  KMP_MB();
1197 
1198  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1199  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1200  KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1201  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1202  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1203  this_thr->th.th_task_team = NULL;
1204  }
1205 
1206 #if OMP_40_ENABLED
1207  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1208  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1209  proc_bind = proc_bind_false;
1210  }
1211  else if ( proc_bind == proc_bind_default ) {
1212  //
1213  // No proc_bind clause was specified, so use the current value
1214  // of proc-bind-var for this parallel region.
1215  //
1216  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1217  }
1218  //
1219  // Reset for next parallel region
1220  //
1221  this_thr->th.th_set_proc_bind = proc_bind_default;
1222 #endif /* OMP_40_ENABLED */
1223 
1224  if( this_thr->th.th_team != serial_team ) {
1225  // Nested level will be an index in the nested nthreads array
1226  int level = this_thr->th.th_team->t.t_level;
1227 
1228  if( serial_team->t.t_serialized ) {
1229  /* this serial team was already used
1230  * TODO: increase performance by making these locks more specific */
1231  kmp_team_t *new_team;
1232 
1233  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1234 
1235 #if OMPT_SUPPORT
1236  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1237 #endif
1238 
1239  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1240 #if OMPT_SUPPORT
1241  ompt_parallel_id,
1242 #endif
1243 #if OMP_40_ENABLED
1244  proc_bind,
1245 #endif
1246  & this_thr->th.th_current_task->td_icvs,
1247  0 USE_NESTED_HOT_ARG(NULL) );
1248  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1249  KMP_ASSERT( new_team );
1250 
1251  /* setup new serialized team and install it */
1252  new_team->t.t_threads[0] = this_thr;
1253  new_team->t.t_parent = this_thr->th.th_team;
1254  serial_team = new_team;
1255  this_thr->th.th_serial_team = serial_team;
1256 
1257  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1258  global_tid, serial_team ) );
1259 
1260 
1261  /* TODO the above breaks the requirement that if we run out of
1262  * resources, then we can still guarantee that serialized teams
1263  * are ok, since we may need to allocate a new one */
1264  } else {
1265  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1266  global_tid, serial_team ) );
1267  }
1268 
1269  /* we have to initialize this serial team */
1270  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1271  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1272  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1273  serial_team->t.t_ident = loc;
1274  serial_team->t.t_serialized = 1;
1275  serial_team->t.t_nproc = 1;
1276  serial_team->t.t_parent = this_thr->th.th_team;
1277  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1278  this_thr->th.th_team = serial_team;
1279  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1280 
1281  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1282  global_tid, this_thr->th.th_current_task ) );
1283  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1284  this_thr->th.th_current_task->td_flags.executing = 0;
1285 
1286  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1287 
1288  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1289  each serialized task represented by team->t.t_serialized? */
1290  copy_icvs(
1291  & this_thr->th.th_current_task->td_icvs,
1292  & this_thr->th.th_current_task->td_parent->td_icvs );
1293 
1294  // Thread value exists in the nested nthreads array for the next nested level
1295  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1296  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1297  }
1298 
1299 #if OMP_40_ENABLED
1300  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1301  this_thr->th.th_current_task->td_icvs.proc_bind
1302  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1303  }
1304 #endif /* OMP_40_ENABLED */
1305 
1306 #if USE_DEBUGGER
1307  serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1308 #endif
1309  this_thr->th.th_info.ds.ds_tid = 0;
1310 
1311  /* set thread cache values */
1312  this_thr->th.th_team_nproc = 1;
1313  this_thr->th.th_team_master = this_thr;
1314  this_thr->th.th_team_serialized = 1;
1315 
1316  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1317  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1318 
1319  propagateFPControl (serial_team);
1320 
1321  /* check if we need to allocate dispatch buffers stack */
1322  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1323  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1324  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1325  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1326  }
1327  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1328 
1329 #if OMPT_SUPPORT
1330  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1331  __ompt_team_assign_id(serial_team, ompt_parallel_id);
1332 #endif
1333 
1334  KMP_MB();
1335 
1336  } else {
1337  /* this serialized team is already being used,
1338  * that's fine, just add another nested level */
1339  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1340  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1341  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1342  ++ serial_team->t.t_serialized;
1343  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1344 
1345  // Nested level will be an index in the nested nthreads array
1346  int level = this_thr->th.th_team->t.t_level;
1347  // Thread value exists in the nested nthreads array for the next nested level
1348  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1349  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1350  }
1351  serial_team->t.t_level++;
1352  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1353  global_tid, serial_team, serial_team->t.t_level ) );
1354 
1355  /* allocate/push dispatch buffers stack */
1356  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1357  {
1358  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1359  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1360  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1361  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1362  }
1363  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1364 
1365  KMP_MB();
1366  }
1367 
1368  if ( __kmp_env_consistency_check )
1369  __kmp_push_parallel( global_tid, NULL );
1370 
1371 }
1372 
1373 /* most of the work for a fork */
1374 /* return true if we really went parallel, false if serialized */
1375 int
1376 __kmp_fork_call(
1377  ident_t * loc,
1378  int gtid,
1379  enum fork_context_e call_context, // Intel, GNU, ...
1380  kmp_int32 argc,
1381 #if OMPT_SUPPORT
1382  void *unwrapped_task,
1383 #endif
1384  microtask_t microtask,
1385  launch_t invoker,
1386 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1387 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1388  va_list * ap
1389 #else
1390  va_list ap
1391 #endif
1392  )
1393 {
1394  void **argv;
1395  int i;
1396  int master_tid;
1397  int master_this_cons;
1398  kmp_team_t *team;
1399  kmp_team_t *parent_team;
1400  kmp_info_t *master_th;
1401  kmp_root_t *root;
1402  int nthreads;
1403  int master_active;
1404  int master_set_numthreads;
1405  int level;
1406 #if OMP_40_ENABLED
1407  int active_level;
1408  int teams_level;
1409 #endif
1410 #if KMP_NESTED_HOT_TEAMS
1411  kmp_hot_team_ptr_t **p_hot_teams;
1412 #endif
1413  { // KMP_TIME_BLOCK
1414  KMP_TIME_DEVELOPER_BLOCK(KMP_fork_call);
1415  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1416 
1417  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1418  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1419  /* Some systems prefer the stack for the root thread(s) to start with */
1420  /* some gap from the parent stack to prevent false sharing. */
1421  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1422  /* These 2 lines below are so this does not get optimized out */
1423  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1424  __kmp_stkpadding += (short)((kmp_int64)dummy);
1425  }
1426 
1427  /* initialize if needed */
1428  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1429  if( ! TCR_4(__kmp_init_parallel) )
1430  __kmp_parallel_initialize();
1431 
1432  /* setup current data */
1433  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1434  parent_team = master_th->th.th_team;
1435  master_tid = master_th->th.th_info.ds.ds_tid;
1436  master_this_cons = master_th->th.th_local.this_construct;
1437  root = master_th->th.th_root;
1438  master_active = root->r.r_active;
1439  master_set_numthreads = master_th->th.th_set_nproc;
1440 
1441 #if OMPT_SUPPORT
1442  ompt_parallel_id_t ompt_parallel_id;
1443  ompt_task_id_t ompt_task_id;
1444  ompt_frame_t *ompt_frame;
1445  ompt_task_id_t my_task_id;
1446  ompt_parallel_id_t my_parallel_id;
1447 
1448  if (ompt_enabled) {
1449  ompt_parallel_id = __ompt_parallel_id_new(gtid);
1450  ompt_task_id = __ompt_get_task_id_internal(0);
1451  ompt_frame = __ompt_get_task_frame_internal(0);
1452  }
1453 #endif
1454 
1455  // Nested level will be an index in the nested nthreads array
1456  level = parent_team->t.t_level;
1457  active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
1458 #if OMP_40_ENABLED
1459  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1460 #endif
1461 #if KMP_NESTED_HOT_TEAMS
1462  p_hot_teams = &master_th->th.th_hot_teams;
1463  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1464  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1465  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1466  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1467  (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1468  }
1469 #endif
1470 
1471 #if OMPT_SUPPORT
1472  if (ompt_enabled &&
1473  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1474  int team_size = master_set_numthreads;
1475 
1476  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1477  ompt_task_id, ompt_frame, ompt_parallel_id,
1478  team_size, unwrapped_task, OMPT_INVOKER(call_context));
1479  }
1480 #endif
1481 
1482  master_th->th.th_ident = loc;
1483 
1484 #if OMP_40_ENABLED
1485  if ( master_th->th.th_teams_microtask &&
1486  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1487  // AC: This is start of parallel that is nested inside teams construct.
1488  // The team is actual (hot), all workers are ready at the fork barrier.
1489  // No lock needed to initialize the team a bit, then free workers.
1490  parent_team->t.t_ident = loc;
1491  __kmp_alloc_argv_entries( argc, parent_team, TRUE );
1492  parent_team->t.t_argc = argc;
1493  argv = (void**)parent_team->t.t_argv;
1494  for( i=argc-1; i >= 0; --i )
1495 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1496 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1497  *argv++ = va_arg( *ap, void * );
1498 #else
1499  *argv++ = va_arg( ap, void * );
1500 #endif
1501  /* Increment our nested depth levels, but do not increase the serialization count */
1502  if ( parent_team == master_th->th.th_serial_team ) {
1503  // AC: we are in serialized parallel
1504  __kmpc_serialized_parallel(loc, gtid);
1505  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1506  parent_team->t.t_serialized--; // AC: need this so that enquiry functions
1507  // work correctly; will restore at join time
1508 
1509 #if OMPT_SUPPORT
1510  void *dummy;
1511  void **exit_runtime_p;
1512 
1513  ompt_lw_taskteam_t lw_taskteam;
1514 
1515  if (ompt_enabled) {
1516  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1517  unwrapped_task, ompt_parallel_id);
1518  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1519  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1520 
1521  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1522 
1523 #if OMPT_TRACE
1524  /* OMPT implicit task begin */
1525  my_task_id = lw_taskteam.ompt_task_info.task_id;
1526  my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1527  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1528  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1529  my_parallel_id, my_task_id);
1530  }
1531 #endif
1532 
1533  /* OMPT state */
1534  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1535  } else {
1536  exit_runtime_p = &dummy;
1537  }
1538 #endif
1539 
1540  {
1541  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1542  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1543  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1544 #if OMPT_SUPPORT
1545  , exit_runtime_p
1546 #endif
1547  );
1548  }
1549 
1550 #if OMPT_SUPPORT
1551  if (ompt_enabled) {
1552 #if OMPT_TRACE
1553  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1554 
1555  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1556  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1557  ompt_parallel_id, ompt_task_id);
1558  }
1559 
1560  __ompt_lw_taskteam_unlink(master_th);
1561  // reset/clear the task id only after unlinking the task
1562  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1563 #endif
1564 
1565  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1566  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1567  ompt_parallel_id, ompt_task_id,
1568  OMPT_INVOKER(call_context));
1569  }
1570  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1571  }
1572 #endif
1573  return TRUE;
1574  }
1575 
1576  parent_team->t.t_pkfn = microtask;
1577 #if OMPT_SUPPORT
1578  parent_team->t.ompt_team_info.microtask = unwrapped_task;
1579 #endif
1580  parent_team->t.t_invoke = invoker;
1581  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1582  parent_team->t.t_active_level ++;
1583  parent_team->t.t_level ++;
1584 
1585  /* Change number of threads in the team if requested */
1586  if ( master_set_numthreads ) { // The parallel has num_threads clause
1587  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1588  // AC: can only reduce the number of threads dynamically, cannot increase
1589  kmp_info_t **other_threads = parent_team->t.t_threads;
1590  parent_team->t.t_nproc = master_set_numthreads;
1591  for ( i = 0; i < master_set_numthreads; ++i ) {
1592  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1593  }
1594  // Keep extra threads hot in the team for possible next parallels
1595  }
1596  master_th->th.th_set_nproc = 0;
1597  }
1598 
1599 #if USE_DEBUGGER
1600  if ( __kmp_debugging ) { // Let debugger override number of threads.
1601  int nth = __kmp_omp_num_threads( loc );
1602  if ( nth > 0 ) { // 0 means debugger does not want to change number of threads.
1603  master_set_numthreads = nth;
1604  }; // if
1605  }; // if
1606 #endif
1607 
1608  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1609  __kmp_internal_fork( loc, gtid, parent_team );
1610  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1611 
1612  /* Invoke microtask for MASTER thread */
1613  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1614  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1615 
1616  {
1617  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1618  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1619  if (! parent_team->t.t_invoke( gtid )) {
1620  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1621  }
1622  }
1623  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1624  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1625  KMP_MB(); /* Flush all pending memory write invalidates. */
1626 
1627  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1628 
1629  return TRUE;
1630  } // Parallel closely nested in teams construct
1631 #endif /* OMP_40_ENABLED */
1632 
1633 #if KMP_DEBUG
1634  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1635  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1636  }
1637 #endif
1638 
1639  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1640  nthreads = 1;
1641  } else {
1642 #if OMP_40_ENABLED
1643  int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1644 #endif
1645  nthreads = master_set_numthreads ?
1646  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1647 
1648  // Check whether we need to take the forkjoin lock (no need for a serialized parallel outside of a teams construct).
1649  // This code was moved here from __kmp_reserve_threads() to speed up nested serialized parallels.
1650  if (nthreads > 1) {
1651  if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1652 #if OMP_40_ENABLED
1653  && !enter_teams
1654 #endif /* OMP_40_ENABLED */
1655  ) ) || ( __kmp_library == library_serial ) ) {
1656  KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1657  gtid, nthreads ));
1658  nthreads = 1;
1659  }
1660  }
1661  if ( nthreads > 1 ) {
1662  /* determine how many new threads we can use */
1663  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1664 
1665  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1666 #if OMP_40_ENABLED
1667 /* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
1668  but each can only have 1 thread if nesting is disabled. If teams is called from a serial region,
1669  then the teams and their threads should be created regardless of the nesting setting. */
1670  , enter_teams
1671 #endif /* OMP_40_ENABLED */
1672  );
1673  if ( nthreads == 1 ) {
1674  // Free the lock for single-threaded execution here;
1675  // for multi-threaded execution it will be freed later,
1676  // after the team of threads has been created and initialized.
1677  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1678  }
1679  }
1680  }
1681  KMP_DEBUG_ASSERT( nthreads > 0 );
1682 
1683  /* If we temporarily changed the set number of threads then restore it now */
1684  master_th->th.th_set_nproc = 0;
1685 
1686  /* create a serialized parallel region? */
1687  if ( nthreads == 1 ) {
1688  /* josh todo: hypothetical question: what do we do for OS X*? */
1689 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1690  void * args[ argc ];
1691 #else
1692  void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1693 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1694 
1695  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1696 
1697  __kmpc_serialized_parallel(loc, gtid);
1698 
1699  if ( call_context == fork_context_intel ) {
1700  /* TODO this sucks, use the compiler itself to pass args! :) */
1701  master_th->th.th_serial_team->t.t_ident = loc;
1702 #if OMP_40_ENABLED
1703  if ( !ap ) {
1704  // revert change made in __kmpc_serialized_parallel()
1705  master_th->th.th_serial_team->t.t_level--;
1706  // Get args from parent team for teams construct
1707 
1708 #if OMPT_SUPPORT
1709  void *dummy;
1710  void **exit_runtime_p;
1711 
1712  ompt_lw_taskteam_t lw_taskteam;
1713 
1714  if (ompt_enabled) {
1715  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1716  unwrapped_task, ompt_parallel_id);
1717  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1718  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1719 
1720  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1721 
1722 #if OMPT_TRACE
1723  my_task_id = lw_taskteam.ompt_task_info.task_id;
1724  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1725  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1726  ompt_parallel_id, my_task_id);
1727  }
1728 #endif
1729 
1730  /* OMPT state */
1731  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1732  } else {
1733  exit_runtime_p = &dummy;
1734  }
1735 #endif
1736 
1737  {
1738  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1739  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1740  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1741 #if OMPT_SUPPORT
1742  , exit_runtime_p
1743 #endif
1744  );
1745  }
1746 
1747 #if OMPT_SUPPORT
1748  if (ompt_enabled) {
1749  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1750 
1751 #if OMPT_TRACE
1752  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1753  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1754  ompt_parallel_id, ompt_task_id);
1755  }
1756 #endif
1757 
1758  __ompt_lw_taskteam_unlink(master_th);
1759  // reset/clear the task id only after unlinking the task
1760  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1761 
1762  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1763  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1764  ompt_parallel_id, ompt_task_id,
1765  OMPT_INVOKER(call_context));
1766  }
1767  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1768  }
1769 #endif
1770  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1771  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1772  team = master_th->th.th_team;
1773  //team->t.t_pkfn = microtask;
1774  team->t.t_invoke = invoker;
1775  __kmp_alloc_argv_entries( argc, team, TRUE );
1776  team->t.t_argc = argc;
1777  argv = (void**) team->t.t_argv;
1778  if ( ap ) {
1779  for( i=argc-1; i >= 0; --i )
1780 // TODO: revert workaround for Intel(R) 64 tracker #96
1781 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1782  *argv++ = va_arg( *ap, void * );
1783 # else
1784  *argv++ = va_arg( ap, void * );
1785 # endif
1786  } else {
1787  for( i=0; i < argc; ++i )
1788  // Get args from parent team for teams construct
1789  argv[i] = parent_team->t.t_argv[i];
1790  }
1791  // AC: revert change made in __kmpc_serialized_parallel()
1792  // because initial code in teams should have level=0
1793  team->t.t_level--;
1794  // AC: call special invoker for outer "parallel" of the teams construct
1795  {
1796  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1797  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1798  invoker(gtid);
1799  }
1800  } else {
1801 #endif /* OMP_40_ENABLED */
1802  argv = args;
1803  for( i=argc-1; i >= 0; --i )
1804 // TODO: revert workaround for Intel(R) 64 tracker #96
1805 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1806  *argv++ = va_arg( *ap, void * );
1807 #else
1808  *argv++ = va_arg( ap, void * );
1809 #endif
1810  KMP_MB();
1811 
1812 #if OMPT_SUPPORT
1813  void *dummy;
1814  void **exit_runtime_p;
1815 
1816  ompt_lw_taskteam_t lw_taskteam;
1817 
1818  if (ompt_enabled) {
1819  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1820  unwrapped_task, ompt_parallel_id);
1821  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1822  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1823 
1824  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1825 
1826 #if OMPT_TRACE
1827  /* OMPT implicit task begin */
1828  my_task_id = lw_taskteam.ompt_task_info.task_id;
1829  my_parallel_id = ompt_parallel_id;
1830  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1831  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1832  my_parallel_id, my_task_id);
1833  }
1834 #endif
1835 
1836  /* OMPT state */
1837  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1838  } else {
1839  exit_runtime_p = &dummy;
1840  }
1841 #endif
1842 
1843  {
1844  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1845  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1846  __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1847 #if OMPT_SUPPORT
1848  , exit_runtime_p
1849 #endif
1850  );
1851  }
1852 
1853 #if OMPT_SUPPORT
1854  if (ompt_enabled) {
1855 #if OMPT_TRACE
1856  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1857 
1858  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1859  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1860  my_parallel_id, my_task_id);
1861  }
1862 #endif
1863 
1864  __ompt_lw_taskteam_unlink(master_th);
1865  // reset/clear the task id only after unlinking the task
1866  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1867 
1868  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1869  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1870  ompt_parallel_id, ompt_task_id,
1871  OMPT_INVOKER(call_context));
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876 #if OMP_40_ENABLED
1877  }
1878 #endif /* OMP_40_ENABLED */
1879  }
1880  else if ( call_context == fork_context_gnu ) {
1881 #if OMPT_SUPPORT
1882  ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1883  __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1884  __ompt_lw_taskteam_init(lwt, master_th, gtid,
1885  unwrapped_task, ompt_parallel_id);
1886 
1887  lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1888  lwt->ompt_task_info.frame.exit_runtime_frame = 0;
1889  __ompt_lw_taskteam_link(lwt, master_th);
1890 #endif
1891 
1892  // we were called from GNU native code
1893  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1894  return FALSE;
1895  }
1896  else {
1897  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1898  }
1899 
1900 
1901  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1902  KMP_MB();
1903  return FALSE;
1904  }
1905 
1906  // GEH: only modify the executing flag in the case when not serialized
1907  // serialized case is handled in kmpc_serialized_parallel
1908  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1909  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1910  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1911  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1912  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1913  master_th->th.th_current_task->td_flags.executing = 0;
1914 
1915 #if OMP_40_ENABLED
1916  if ( !master_th->th.th_teams_microtask || level > teams_level )
1917 #endif /* OMP_40_ENABLED */
1918  {
1919  /* Increment our nested depth level */
1920  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1921  }
1922 
1923  // See if we need to make a copy of the ICVs.
1924  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1925  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1926  nthreads_icv = __kmp_nested_nth.nth[level+1];
1927  }
1928  else {
1929  nthreads_icv = 0; // don't update
1930  }
1931 
1932 #if OMP_40_ENABLED
1933  // Figure out the proc_bind_policy for the new team.
1934  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1935  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1936  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1937  proc_bind = proc_bind_false;
1938  }
1939  else {
1940  if (proc_bind == proc_bind_default) {
1941  // No proc_bind clause specified; use current proc-bind-var for this parallel region
1942  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1943  }
1944  /* else: The proc_bind policy was specified explicitly on parallel clause. This
1945  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1946  // Figure the value of proc-bind-var for the child threads.
1947  if ((level+1 < __kmp_nested_proc_bind.used)
1948  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1949  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1950  }
1951  }
1952 
1953  // Reset for next parallel region
1954  master_th->th.th_set_proc_bind = proc_bind_default;
1955 #endif /* OMP_40_ENABLED */
1956 
1957  if ((nthreads_icv > 0)
1958 #if OMP_40_ENABLED
1959  || (proc_bind_icv != proc_bind_default)
1960 #endif /* OMP_40_ENABLED */
1961  ) {
1962  kmp_internal_control_t new_icvs;
1963  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1964  new_icvs.next = NULL;
1965  if (nthreads_icv > 0) {
1966  new_icvs.nproc = nthreads_icv;
1967  }
1968 
1969 #if OMP_40_ENABLED
1970  if (proc_bind_icv != proc_bind_default) {
1971  new_icvs.proc_bind = proc_bind_icv;
1972  }
1973 #endif /* OMP_40_ENABLED */
1974 
1975  /* allocate a new parallel team */
1976  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1977  team = __kmp_allocate_team(root, nthreads, nthreads,
1978 #if OMPT_SUPPORT
1979  ompt_parallel_id,
1980 #endif
1981 #if OMP_40_ENABLED
1982  proc_bind,
1983 #endif
1984  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1985  } else {
1986  /* allocate a new parallel team */
1987  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1988  team = __kmp_allocate_team(root, nthreads, nthreads,
1989 #if OMPT_SUPPORT
1990  ompt_parallel_id,
1991 #endif
1992 #if OMP_40_ENABLED
1993  proc_bind,
1994 #endif
1995  &master_th->th.th_current_task->td_icvs, argc
1996  USE_NESTED_HOT_ARG(master_th) );
1997  }
1998  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
1999 
2000  /* setup the new team */
2001  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2002  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2003  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2004  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2005  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2006 #if OMPT_SUPPORT
2007  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2008 #endif
2009  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); /* TODO move this to root, maybe */
2010  // TODO: parent_team->t.t_level == INT_MAX ???
2011 #if OMP_40_ENABLED
2012  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2013 #endif /* OMP_40_ENABLED */
2014  int new_level = parent_team->t.t_level + 1;
2015  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2016  new_level = parent_team->t.t_active_level + 1;
2017  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2018 #if OMP_40_ENABLED
2019  } else {
2020  // AC: Do not increase parallel level at start of the teams construct
2021  int new_level = parent_team->t.t_level;
2022  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2023  new_level = parent_team->t.t_active_level;
2024  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2025  }
2026 #endif /* OMP_40_ENABLED */
2027  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2028  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || team->t.t_sched.chunk != new_sched.chunk)
2029  team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
2030 
2031 #if OMP_40_ENABLED
2032  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2033 #endif
2034 
2035  // Update the floating point rounding in the team if required.
2036  propagateFPControl(team);
2037 
2038  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2039  // Set the master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2040 #if 0
2041  // Patch out an assertion that trips while the runtime seems to operate correctly.
2042  // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2043  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2044 #endif
2045  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2046  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2047  parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2048 
2049  if ( active_level || master_th->th.th_task_team ) {
2050  // Take a memo of master's task_state
2051  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2052  if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2053  kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2054  kmp_uint8 *old_stack, *new_stack;
2055  kmp_uint32 i;
2056  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2057  for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2058  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2059  }
2060  for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2061  new_stack[i] = 0;
2062  }
2063  old_stack = master_th->th.th_task_state_memo_stack;
2064  master_th->th.th_task_state_memo_stack = new_stack;
2065  master_th->th.th_task_state_stack_sz = new_size;
2066  __kmp_free(old_stack);
2067  }
2068  // Store master's task_state on stack
2069  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2070  master_th->th.th_task_state_top++;
2071 #if KMP_NESTED_HOT_TEAMS
2072  if (team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if this is a nested hot team
2073  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2074  }
2075  else {
2076 #endif
2077  master_th->th.th_task_state = 0;
2078 #if KMP_NESTED_HOT_TEAMS
2079  }
2080 #endif
2081  }
2082 #if !KMP_NESTED_HOT_TEAMS
2083  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2084 #endif
2085  }
2086 
2087  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2088  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2089  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2090  ( team->t.t_master_tid == 0 &&
2091  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2092  KMP_MB();
2093 
2094  /* now, setup the arguments */
2095  argv = (void**)team->t.t_argv;
2096 #if OMP_40_ENABLED
2097  if ( ap ) {
2098 #endif /* OMP_40_ENABLED */
2099  for ( i=argc-1; i >= 0; --i ) {
2100 // TODO: revert workaround for Intel(R) 64 tracker #96
2101 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2102  void *new_argv = va_arg(*ap, void *);
2103 #else
2104  void *new_argv = va_arg(ap, void *);
2105 #endif
2106  KMP_CHECK_UPDATE(*argv, new_argv);
2107  argv++;
2108  }
2109 #if OMP_40_ENABLED
2110  } else {
2111  for ( i=0; i < argc; ++i ) {
2112  // Get args from parent team for teams construct
2113  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2114  }
2115  }
2116 #endif /* OMP_40_ENABLED */
2117 
2118  /* now actually fork the threads */
2119  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2120  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2121  root->r.r_active = TRUE;
2122 
2123  __kmp_fork_team_threads( root, team, master_th, gtid );
2124  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2125 
2126 #if OMPT_SUPPORT
2127  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2128 #endif
2129 
2130  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2131 
2132 #if USE_ITT_BUILD
2133  if ( team->t.t_active_level == 1 // only report frames at level 1
2134 # if OMP_40_ENABLED
2135  && !master_th->th.th_teams_microtask // not in teams construct
2136 # endif /* OMP_40_ENABLED */
2137  ) {
2138 #if USE_ITT_NOTIFY
2139  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2140  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2141  {
2142  kmp_uint64 tmp_time = 0;
2143  if ( __itt_get_timestamp_ptr )
2144  tmp_time = __itt_get_timestamp();
2145  // Internal fork - report frame begin
2146  master_th->th.th_frame_time = tmp_time;
2147  if ( __kmp_forkjoin_frames_mode == 3 )
2148  team->t.t_region_time = tmp_time;
2149  } else // only one notification scheme (either "submit" or "forking/joined", not both)
2150 #endif /* USE_ITT_NOTIFY */
2151  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2152  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2153  { // Mark start of "parallel" region for VTune.
2154  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2155  }
2156  }
2157 #endif /* USE_ITT_BUILD */
2158 
2159  /* now go on and do the work */
2160  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2161  KMP_MB();
2162  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2163  root, team, master_th, gtid));
2164 
2165 #if USE_ITT_BUILD
2166  if ( __itt_stack_caller_create_ptr ) {
2167  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2168  }
2169 #endif /* USE_ITT_BUILD */
2170 
2171 #if OMP_40_ENABLED
2172  if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
2173 #endif /* OMP_40_ENABLED */
2174  {
2175  __kmp_internal_fork( loc, gtid, team );
2176  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2177  root, team, master_th, gtid));
2178  }
2179 
2180  if (call_context == fork_context_gnu) {
2181  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2182  return TRUE;
2183  }
2184 
2185  /* Invoke microtask for MASTER thread */
2186  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2187  gtid, team->t.t_id, team->t.t_pkfn ) );
2188  } // END of timer KMP_fork_call block
2189 
2190  {
2191  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2192  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2193  // KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
2194  if (! team->t.t_invoke( gtid )) {
2195  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2196  }
2197  }
2198  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2199  gtid, team->t.t_id, team->t.t_pkfn ) );
2200  KMP_MB(); /* Flush all pending memory write invalidates. */
2201 
2202  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2203 
2204 #if OMPT_SUPPORT
2205  if (ompt_enabled) {
2206  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2207  }
2208 #endif
2209 
2210  return TRUE;
2211 }
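/*
 * Illustrative sketch of how __kmp_fork_call() is typically reached. A
 * compiler outlines the body of "#pragma omp parallel" into a microtask and
 * emits a call into the runtime entry point (e.g. __kmpc_fork_call()), which
 * forwards to __kmp_fork_call() and, when the region ends, to
 * __kmp_join_call(). The outlined name below is hypothetical:
 *
 *     // user code
 *     #pragma omp parallel
 *     { work(); }
 *
 *     // roughly what the compiler generates
 *     void __outlined_parallel_body( kmp_int32 *gtid, kmp_int32 *btid )
 *     { work(); }
 *     ...
 *     __kmpc_fork_call( &loc, 0, (kmpc_micro)__outlined_parallel_body );
 *
 * Depending on nesting and the ICVs, __kmp_fork_call() either serializes the
 * region (the nthreads == 1 path above) or allocates/wakes a team and invokes
 * the microtask on the master thread via team->t.t_invoke().
 */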
2212 
2213 #if OMPT_SUPPORT
2214 static inline void
2215 __kmp_join_restore_state(
2216  kmp_info_t *thread,
2217  kmp_team_t *team)
2218 {
2219  // restore state outside the region
2220  thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2221  ompt_state_work_serial : ompt_state_work_parallel);
2222 }
2223 
2224 static inline void
2225 __kmp_join_ompt(
2226  kmp_info_t *thread,
2227  kmp_team_t *team,
2228  ompt_parallel_id_t parallel_id,
2229  fork_context_e fork_context)
2230 {
2231  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2232  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2233  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2234  parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2235  }
2236 
2237  __kmp_join_restore_state(thread,team);
2238 }
2239 #endif
2240 
2241 void
2242 __kmp_join_call(ident_t *loc, int gtid
2243 #if OMPT_SUPPORT
2244  , enum fork_context_e fork_context
2245 #endif
2246 #if OMP_40_ENABLED
2247  , int exit_teams
2248 #endif /* OMP_40_ENABLED */
2249 )
2250 {
2251  KMP_TIME_DEVELOPER_BLOCK(KMP_join_call);
2252  kmp_team_t *team;
2253  kmp_team_t *parent_team;
2254  kmp_info_t *master_th;
2255  kmp_root_t *root;
2256  int master_active;
2257  int i;
2258 
2259  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2260 
2261  /* setup current data */
2262  master_th = __kmp_threads[ gtid ];
2263  root = master_th->th.th_root;
2264  team = master_th->th.th_team;
2265  parent_team = team->t.t_parent;
2266 
2267  master_th->th.th_ident = loc;
2268 
2269 #if OMPT_SUPPORT
2270  if (ompt_enabled) {
2271  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2272  }
2273 #endif
2274 
2275 #if KMP_DEBUG
2276  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2277  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2278  __kmp_gtid_from_thread( master_th ), team,
2279  team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2280  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2281  }
2282 #endif
2283 
2284  if( team->t.t_serialized ) {
2285 #if OMP_40_ENABLED
2286  if ( master_th->th.th_teams_microtask ) {
2287  // We are in teams construct
2288  int level = team->t.t_level;
2289  int tlevel = master_th->th.th_teams_level;
2290  if ( level == tlevel ) {
2291  // AC: we haven't incremented it earlier at start of teams construct,
2292  // so do it here - at the end of teams construct
2293  team->t.t_level++;
2294  } else if ( level == tlevel + 1 ) {
2295  // AC: we are exiting parallel inside teams, need to increment serialization
2296  // in order to restore it in the next call to __kmpc_end_serialized_parallel
2297  team->t.t_serialized++;
2298  }
2299  }
2300 #endif /* OMP_40_ENABLED */
2301  __kmpc_end_serialized_parallel( loc, gtid );
2302 
2303 #if OMPT_SUPPORT
2304  if (ompt_enabled) {
2305  __kmp_join_restore_state(master_th, parent_team);
2306  }
2307 #endif
2308 
2309  return;
2310  }
2311 
2312  master_active = team->t.t_master_active;
2313 
2314 #if OMP_40_ENABLED
2315  if (!exit_teams)
2316 #endif /* OMP_40_ENABLED */
2317  {
2318  // AC: No barrier for internal teams at exit from the teams construct,
2319  // but there is a barrier for the external team (league).
2320  __kmp_internal_join( loc, gtid, team );
2321  }
2322 #if OMP_40_ENABLED
2323  else {
2324  master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2325  }
2326 #endif /* OMP_40_ENABLED */
2327 
2328  KMP_MB();
2329 
2330 #if OMPT_SUPPORT
2331  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2332 #endif
2333 
2334 #if USE_ITT_BUILD
2335  if ( __itt_stack_caller_create_ptr ) {
2336  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2337  }
2338 
2339  // Mark end of "parallel" region for VTune.
2340  if ( team->t.t_active_level == 1
2341 # if OMP_40_ENABLED
2342  && !master_th->th.th_teams_microtask /* not in teams construct */
2343 # endif /* OMP_40_ENABLED */
2344  ) {
2345  master_th->th.th_ident = loc;
2346  // only one notification scheme (either "submit" or "forking/joined", not both)
2347  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2348  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2349  0, loc, master_th->th.th_team_nproc, 1 );
2350  else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2351  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2352  __kmp_itt_region_joined( gtid );
2353  } // active_level == 1
2354 #endif /* USE_ITT_BUILD */
2355 
2356 #if OMP_40_ENABLED
2357  if ( master_th->th.th_teams_microtask &&
2358  !exit_teams &&
2359  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2360  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2361  // AC: We need to leave the team structure intact at the end
2362  // of a parallel inside the teams construct, so that the same (hot) team
2363  // works at the next parallel; only adjust the nesting levels.
2364 
2365  /* Decrement our nested depth level */
2366  team->t.t_level --;
2367  team->t.t_active_level --;
2368  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2369 
2370  /* Restore number of threads in the team if needed */
2371  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2372  int old_num = master_th->th.th_team_nproc;
2373  int new_num = master_th->th.th_teams_size.nth;
2374  kmp_info_t **other_threads = team->t.t_threads;
2375  team->t.t_nproc = new_num;
2376  for ( i = 0; i < old_num; ++i ) {
2377  other_threads[i]->th.th_team_nproc = new_num;
2378  }
2379  // Adjust the states of the unused threads of the team
2380  for ( i = old_num; i < new_num; ++i ) {
2381  // Re-initialize thread's barrier data.
2382  int b;
2383  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2384  for ( b = 0; b < bs_last_barrier; ++ b ) {
2385  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2386  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2387 #if USE_DEBUGGER
2388  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2389 #endif
2390  }
2391  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2392  // Synchronize thread's task state
2393  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2394  }
2395  }
2396  }
2397 
2398 #if OMPT_SUPPORT
2399  if (ompt_enabled) {
2400  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2401  }
2402 #endif
2403 
2404  return;
2405  }
2406 #endif /* OMP_40_ENABLED */
2407 
2408  /* do cleanup and restore the parent team */
2409  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2410  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2411 
2412  master_th->th.th_dispatch =
2413  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2414 
2415  /* jc: The following lock has instructions with REL and ACQ semantics,
2416  separating the parallel user code called in this parallel region
2417  from the serial user code called after this function returns.
2418  */
2419  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2420 
2421 #if OMP_40_ENABLED
2422  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2423 #endif /* OMP_40_ENABLED */
2424  {
2425  /* Decrement our nested depth level */
2426  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2427  }
2428  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2429 
2430 #if OMPT_SUPPORT && OMPT_TRACE
2431  if (ompt_enabled) {
2432  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2433  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2434  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2435  parallel_id, task_info->task_id);
2436  }
2437  task_info->frame.exit_runtime_frame = 0;
2438  task_info->task_id = 0;
2439  }
2440 #endif
2441 
2442  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2443  0, master_th, team ) );
2444  __kmp_pop_current_task_from_thread( master_th );
2445 
2446 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2447  //
2448  // Restore master thread's partition.
2449  //
2450  master_th->th.th_first_place = team->t.t_first_place;
2451  master_th->th.th_last_place = team->t.t_last_place;
2452 #endif /* OMP_40_ENABLED */
2453 
2454  updateHWFPControl (team);
2455 
2456  if ( root->r.r_active != master_active )
2457  root->r.r_active = master_active;
2458 
2459  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2460 
2461  /* this race was fun to find. make sure the following is in the critical
2462  * region otherwise assertions may fail occasionally since the old team
2463  * may be reallocated and the hierarchy appears inconsistent. it is
2464  * actually safe to run and won't cause any bugs, but will cause those
2465  * assertion failures. it's only one deref&assign so might as well put this
2466  * in the critical region */
2467  master_th->th.th_team = parent_team;
2468  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2469  master_th->th.th_team_master = parent_team->t.t_threads[0];
2470  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2471 
2472  /* restore serialized team, if need be */
2473  if( parent_team->t.t_serialized &&
2474  parent_team != master_th->th.th_serial_team &&
2475  parent_team != root->r.r_root_team ) {
2476  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2477  master_th->th.th_serial_team = parent_team;
2478  }
2479 
2480  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2481  if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2482  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2483  // Remember master's state if we re-use this nested hot team
2484  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2485  --master_th->th.th_task_state_top; // pop
2486  // Now restore state at this level
2487  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2488  }
2489  // Copy the task team from the parent team to the master thread
2490  master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2491  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2492  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2493  }
2494 
2495  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2496  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2497  master_th->th.th_current_task->td_flags.executing = 1;
2498 
2499  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2500 
2501 #if OMPT_SUPPORT
2502  if (ompt_enabled) {
2503  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2504  }
2505 #endif
2506 
2507  KMP_MB();
2508  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2509 }
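/*
 * Summary sketch: __kmp_join_call() is the counterpart of __kmp_fork_call().
 * After the join barrier (__kmp_internal_join), the team is returned to the
 * pool (__kmp_free_team) and the master's team, task team and ICV state are
 * restored to those of the parent team, with special handling for serialized
 * regions and for the teams construct (hot team kept intact, levels adjusted).
 */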
2510 
2511 /* ------------------------------------------------------------------------ */
2512 /* ------------------------------------------------------------------------ */
2513 
2514 /* Check whether we should push an internal control record onto the
2515  serial team stack. If so, do it. */
2516 void
2517 __kmp_save_internal_controls ( kmp_info_t * thread )
2518 {
2519 
2520  if ( thread->th.th_team != thread->th.th_serial_team ) {
2521  return;
2522  }
2523  if (thread->th.th_team->t.t_serialized > 1) {
2524  int push = 0;
2525 
2526  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2527  push = 1;
2528  } else {
2529  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2530  thread->th.th_team->t.t_serialized ) {
2531  push = 1;
2532  }
2533  }
2534  if (push) { /* push a record on the serial team's stack */
2535  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2536 
2537  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2538 
2539  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2540 
2541  control->next = thread->th.th_team->t.t_control_stack_top;
2542  thread->th.th_team->t.t_control_stack_top = control;
2543  }
2544  }
2545 }
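/*
 * Illustrative sketch: a record is pushed only when the thread is inside a
 * nested serialized region (t_serialized > 1) and no record exists yet for the
 * current serial nesting level, so that ICV changes made there can be undone
 * when the serialized parallel ends. A plausible user-level trigger:
 *
 *     #pragma omp parallel num_threads(1)     // serialized
 *     #pragma omp parallel num_threads(1)     // nested serialized, t_serialized > 1
 *     {
 *         omp_set_num_threads( 2 );   // reaches __kmp_save_internal_controls()
 *     }                               // ICVs restored from the pushed record
 */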
2546 
2547 /* Changes set_nproc */
2548 void
2549 __kmp_set_num_threads( int new_nth, int gtid )
2550 {
2551  kmp_info_t *thread;
2552  kmp_root_t *root;
2553 
2554  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2555  KMP_DEBUG_ASSERT( __kmp_init_serial );
2556 
2557  if (new_nth < 1)
2558  new_nth = 1;
2559  else if (new_nth > __kmp_max_nth)
2560  new_nth = __kmp_max_nth;
2561 
2562  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2563  thread = __kmp_threads[gtid];
2564 
2565  __kmp_save_internal_controls( thread );
2566 
2567  set__nproc( thread, new_nth );
2568 
2569  //
2570  // If this omp_set_num_threads() call will cause the hot team size to be
2571  // reduced (in the absence of a num_threads clause), then reduce it now,
2572  // rather than waiting for the next parallel region.
2573  //
2574  root = thread->th.th_root;
2575  if ( __kmp_init_parallel && ( ! root->r.r_active )
2576  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2577 #if KMP_NESTED_HOT_TEAMS
2578  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2579 #endif
2580  ) {
2581  kmp_team_t *hot_team = root->r.r_hot_team;
2582  int f;
2583 
2584  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2585 
2586  // Release the extra threads we don't need any more.
2587  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2588  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2589  if ( __kmp_tasking_mode != tskm_immediate_exec) {
2590  // When decreasing team size, threads no longer in the team should unref task team.
2591  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2592  }
2593  __kmp_free_thread( hot_team->t.t_threads[f] );
2594  hot_team->t.t_threads[f] = NULL;
2595  }
2596  hot_team->t.t_nproc = new_nth;
2597 #if KMP_NESTED_HOT_TEAMS
2598  if( thread->th.th_hot_teams ) {
2599  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2600  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2601  }
2602 #endif
2603 
2604  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2605 
2606  //
2607  // Update the t_nproc field in the threads that are still active.
2608  //
2609  for( f=0 ; f < new_nth; f++ ) {
2610  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2611  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2612  }
2613  // Special flag to mark an omp_set_num_threads() call
2614  hot_team->t.t_size_changed = -1;
2615  }
2616 }
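/*
 * Illustrative usage (user level) of the hot-team shrink handled above; the
 * exact timing of the release is an implementation detail:
 *
 *     #include <omp.h>
 *     int main( void )
 *     {
 *         #pragma omp parallel        // hot team grows to the default size
 *         { }
 *         omp_set_num_threads( 2 );   // outside a parallel region: the extra
 *                                     // hot-team threads may be freed right here
 *         #pragma omp parallel        // next region starts with the reduced size
 *         { }
 *         return 0;
 *     }
 */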
2617 
2618 /* Changes max_active_levels */
2619 void
2620 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2621 {
2622  kmp_info_t *thread;
2623 
2624  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2625  KMP_DEBUG_ASSERT( __kmp_init_serial );
2626 
2627  // validate max_active_levels
2628  if( max_active_levels < 0 ) {
2629  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2630  // We ignore this call if the user has specified a negative value.
2631  // The current setting won't be changed. The last valid setting will be used.
2632  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2633  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2634  return;
2635  }
2636  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2637  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2638  // We allow a zero value. (implementation defined behavior)
2639  } else {
2640  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2641  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2642  // Current upper limit is MAX_INT. (implementation defined behavior)
2643  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2644  // Actually, the flow should never get here as long as the limit is MAX_INT.
2645  }
2646  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2647 
2648  thread = __kmp_threads[ gtid ];
2649 
2650  __kmp_save_internal_controls( thread );
2651 
2652  set__max_active_levels( thread, max_active_levels );
2653 
2654 }
2655 
2656 /* Gets max_active_levels */
2657 int
2658 __kmp_get_max_active_levels( int gtid )
2659 {
2660  kmp_info_t *thread;
2661 
2662  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2663  KMP_DEBUG_ASSERT( __kmp_init_serial );
2664 
2665  thread = __kmp_threads[ gtid ];
2666  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2667  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2668  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2669  return thread->th.th_current_task->td_icvs.max_active_levels;
2670 }
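/*
 * Illustrative usage: these two routines back omp_set_max_active_levels() and
 * omp_get_max_active_levels(); out-of-range inputs are ignored or clamped with
 * a warning, e.g.
 *
 *     omp_set_max_active_levels( 2 );         // stored in the current task's ICVs
 *     int n = omp_get_max_active_levels();    // n == 2
 *     omp_set_max_active_levels( -1 );        // ignored with a warning; n stays 2
 */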
2671 
2672 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2673 void
2674 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2675 {
2676  kmp_info_t *thread;
2677 // kmp_team_t *team;
2678 
2679  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2680  KMP_DEBUG_ASSERT( __kmp_init_serial );
2681 
2682  // Check if the kind parameter is valid, correct if needed.
2683  // Valid parameters should fit in one of two intervals - standard or extended:
2684  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2685  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2686  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2687  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2688  {
2689  // TODO: Hint needs attention in case we change the default schedule.
2690  __kmp_msg(
2691  kmp_ms_warning,
2692  KMP_MSG( ScheduleKindOutOfRange, kind ),
2693  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2694  __kmp_msg_null
2695  );
2696  kind = kmp_sched_default;
2697  chunk = 0; // ignore chunk value in case of bad kind
2698  }
2699 
2700  thread = __kmp_threads[ gtid ];
2701 
2702  __kmp_save_internal_controls( thread );
2703 
2704  if ( kind < kmp_sched_upper_std ) {
2705  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2706  // differentiate static chunked vs. unchunked:
2707  // chunk should be invalid to indicate unchunked schedule (which is the default)
2708  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2709  } else {
2710  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2711  }
2712  } else {
2713  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2714  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2715  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2716  }
2717  if ( kind == kmp_sched_auto ) {
2718  // ignore parameter chunk for schedule auto
2719  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2720  } else {
2721  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2722  }
2723 }
2724 
2725 /* Gets def_sched_var ICV values */
2726 void
2727 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2728 {
2729  kmp_info_t *thread;
2730  enum sched_type th_type;
2731 
2732  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2733  KMP_DEBUG_ASSERT( __kmp_init_serial );
2734 
2735  thread = __kmp_threads[ gtid ];
2736 
2737  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2738 
2739  switch ( th_type ) {
2740  case kmp_sch_static:
2741  case kmp_sch_static_greedy:
2742  case kmp_sch_static_balanced:
2743  *kind = kmp_sched_static;
2744  *chunk = 0; // chunk was not set; indicate this with a zero value
2745  return;
2746  case kmp_sch_static_chunked:
2747  *kind = kmp_sched_static;
2748  break;
2749  case kmp_sch_dynamic_chunked:
2750  *kind = kmp_sched_dynamic;
2751  break;
2753  case kmp_sch_guided_iterative_chunked:
2754  case kmp_sch_guided_analytical_chunked:
2755  *kind = kmp_sched_guided;
2756  break;
2757  case kmp_sch_auto:
2758  *kind = kmp_sched_auto;
2759  break;
2760  case kmp_sch_trapezoidal:
2761  *kind = kmp_sched_trapezoidal;
2762  break;
2763 /*
2764  case kmp_sch_static_steal:
2765  *kind = kmp_sched_static_steal;
2766  break;
2767 */
2768  default:
2769  KMP_FATAL( UnknownSchedulingType, th_type );
2770  }
2771 
2772  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2773 }
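/*
 * Illustrative usage: __kmp_set_schedule()/__kmp_get_schedule() back
 * omp_set_schedule()/omp_get_schedule(); the internal sched_type is mapped
 * back to the public kinds as in the switch above, e.g.
 *
 *     omp_set_schedule( omp_sched_dynamic, 4 );  // stored as kmp_sch_dynamic_chunked, chunk 4
 *     omp_sched_t kind; int chunk;
 *     omp_get_schedule( &kind, &chunk );         // kind == omp_sched_dynamic, chunk == 4
 *
 *     omp_set_schedule( omp_sched_static, 0 );   // unchunked static
 *     omp_get_schedule( &kind, &chunk );         // kind == omp_sched_static, chunk == 0
 */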
2774 
2775 int
2776 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2777 
2778  int ii, dd;
2779  kmp_team_t *team;
2780  kmp_info_t *thr;
2781 
2782  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2783  KMP_DEBUG_ASSERT( __kmp_init_serial );
2784 
2785  // validate level
2786  if( level == 0 ) return 0;
2787  if( level < 0 ) return -1;
2788  thr = __kmp_threads[ gtid ];
2789  team = thr->th.th_team;
2790  ii = team->t.t_level;
2791  if( level > ii ) return -1;
2792 
2793 #if OMP_40_ENABLED
2794  if( thr->th.th_teams_microtask ) {
2795  // AC: we are in a teams region where multiple nested teams have the same level
2796  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2797  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2798  KMP_DEBUG_ASSERT( ii >= tlevel );
2799  // AC: since we need to pass through the teams league, we artificially increase ii
2800  if ( ii == tlevel ) {
2801  ii += 2; // three teams have same level
2802  } else {
2803  ii ++; // two teams have same level
2804  }
2805  }
2806  }
2807 #endif
2808 
2809  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2810 
2811  dd = team->t.t_serialized;
2812  level++;
2813  while( ii > level )
2814  {
2815  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2816  {
2817  }
2818  if( ( team->t.t_serialized ) && ( !dd ) ) {
2819  team = team->t.t_parent;
2820  continue;
2821  }
2822  if( ii > level ) {
2823  team = team->t.t_parent;
2824  dd = team->t.t_serialized;
2825  ii--;
2826  }
2827  }
2828 
2829  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2830 }
2831 
2832 int
2833 __kmp_get_team_size( int gtid, int level ) {
2834 
2835  int ii, dd;
2836  kmp_team_t *team;
2837  kmp_info_t *thr;
2838 
2839  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2840  KMP_DEBUG_ASSERT( __kmp_init_serial );
2841 
2842  // validate level
2843  if( level == 0 ) return 1;
2844  if( level < 0 ) return -1;
2845  thr = __kmp_threads[ gtid ];
2846  team = thr->th.th_team;
2847  ii = team->t.t_level;
2848  if( level > ii ) return -1;
2849 
2850 #if OMP_40_ENABLED
2851  if( thr->th.th_teams_microtask ) {
2852  // AC: we are in a teams region where multiple nested teams have the same level
2853  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2854  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2855  KMP_DEBUG_ASSERT( ii >= tlevel );
2856  // AC: since we need to pass through the teams league, we artificially increase ii
2857  if ( ii == tlevel ) {
2858  ii += 2; // three teams have same level
2859  } else {
2860  ii ++; // two teams have same level
2861  }
2862  }
2863  }
2864 #endif
2865 
2866  while( ii > level )
2867  {
2868  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2869  {
2870  }
2871  if( team->t.t_serialized && ( !dd ) ) {
2872  team = team->t.t_parent;
2873  continue;
2874  }
2875  if( ii > level ) {
2876  team = team->t.t_parent;
2877  ii--;
2878  }
2879  }
2880 
2881  return team->t.t_nproc;
2882 }
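/*
 * Illustrative usage: both routines walk up the team hierarchy, skipping
 * serialized levels (and stepping over the teams league when inside a teams
 * construct). Assuming nested parallelism is enabled:
 *
 *     #pragma omp parallel num_threads(4)
 *     #pragma omp parallel num_threads(2)
 *     {
 *         omp_get_ancestor_thread_num( 1 );   // tid within the outer team (0..3)
 *         omp_get_team_size( 1 );             // 4
 *         omp_get_team_size( 2 );             // 2
 *     }
 */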
2883 
2884 kmp_r_sched_t
2885 __kmp_get_schedule_global() {
2886 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2887 // may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
2888 
2889  kmp_r_sched_t r_sched;
2890 
2891  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2892  // __kmp_sched should keep its original value, so that the user can set KMP_SCHEDULE multiple times,
2893  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2894  if ( __kmp_sched == kmp_sch_static ) {
2895  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2896  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2897  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2898  } else {
2899  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2900  }
2901 
2902  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
2903  r_sched.chunk = KMP_DEFAULT_CHUNK;
2904  } else {
2905  r_sched.chunk = __kmp_chunk;
2906  }
2907 
2908  return r_sched;
2909 }
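/*
 * Illustrative note: the globals combined here are typically set from the
 * OMP_SCHEDULE / KMP_SCHEDULE environment variables, e.g. OMP_SCHEDULE="guided,4"
 * yields __kmp_sched == kmp_sch_guided_chunked and __kmp_chunk == 4, so the
 * returned r_sched is { __kmp_guided (iterative or analytical), 4 }.
 */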
2910 
2911 /* ------------------------------------------------------------------------ */
2912 /* ------------------------------------------------------------------------ */
2913 
2914 
2915 /*
2916  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2917  * at least argc number of *t_argv entries for the requested team.
2918  */
2919 static void
2920 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2921 {
2922 
2923  KMP_DEBUG_ASSERT( team );
2924  if( !realloc || argc > team->t.t_max_argc ) {
2925 
2926  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2927  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2928  /* if heap space was previously allocated for the args, free it */
2929  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2930  __kmp_free( (void *) team->t.t_argv );
2931 
2932  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2933  /* use unused space in the cache line for arguments */
2934  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2935  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2936  team->t.t_id, team->t.t_max_argc ));
2937  team->t.t_argv = &team->t.t_inline_argv[0];
2938  if ( __kmp_storage_map ) {
2939  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2940  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2941  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2942  "team_%d.t_inline_argv",
2943  team->t.t_id );
2944  }
2945  } else {
2946  /* allocate space for arguments in the heap */
2947  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2948  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2949  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2950  team->t.t_id, team->t.t_max_argc ));
2951  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2952  if ( __kmp_storage_map ) {
2953  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2954  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2955  team->t.t_id );
2956  }
2957  }
2958  }
2959 }
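/*
 * Note: for a small argument count (argc <= KMP_INLINE_ARGV_ENTRIES) the
 * arguments live in otherwise-unused space inside the team structure
 * (t_inline_argv), avoiding a heap allocation; larger counts fall back to a
 * page-allocated array of at least KMP_MIN_MALLOC_ARGV_ENTRIES (or 2*argc)
 * entries.
 */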
2960 
2961 static void
2962 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2963 {
2964  int i;
2965  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2966  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2967  team->t.t_disp_buffer = (dispatch_shared_info_t*)
2968  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2969  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2970  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2971  team->t.t_max_nproc = max_nth;
2972 
2973  /* setup dispatch buffers */
2974  for(i = 0 ; i < num_disp_buff; ++i) {
2975  team->t.t_disp_buffer[i].buffer_index = i;
2976 #if OMP_45_ENABLED
2977  team->t.t_disp_buffer[i].doacross_buf_idx = i;
2978 #endif
2979  }
2980 }
2981 
2982 static void
2983 __kmp_free_team_arrays(kmp_team_t *team) {
2984  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2985  int i;
2986  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2987  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2988  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
2989  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
2990  }; // if
2991  }; // for
2992  __kmp_free(team->t.t_threads);
2993  __kmp_free(team->t.t_disp_buffer);
2994  __kmp_free(team->t.t_dispatch);
2995  __kmp_free(team->t.t_implicit_task_taskdata);
2996  team->t.t_threads = NULL;
2997  team->t.t_disp_buffer = NULL;
2998  team->t.t_dispatch = NULL;
2999  team->t.t_implicit_task_taskdata = 0;
3000 }
3001 
3002 static void
3003 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3004  kmp_info_t **oldThreads = team->t.t_threads;
3005 
3006  __kmp_free(team->t.t_disp_buffer);
3007  __kmp_free(team->t.t_dispatch);
3008  __kmp_free(team->t.t_implicit_task_taskdata);
3009  __kmp_allocate_team_arrays(team, max_nth);
3010 
3011  KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3012 
3013  __kmp_free(oldThreads);
3014 }
3015 
3016 static kmp_internal_control_t
3017 __kmp_get_global_icvs( void ) {
3018 
3019  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3020 
3021 #if OMP_40_ENABLED
3022  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3023 #endif /* OMP_40_ENABLED */
3024 
3025  kmp_internal_control_t g_icvs = {
3026  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3027  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3028  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3029  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3030  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3031  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3032  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
3033  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3034  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3035  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3036 #if OMP_40_ENABLED
3037  __kmp_nested_proc_bind.bind_types[0],
3038 #endif /* OMP_40_ENABLED */
3039  NULL //struct kmp_internal_control *next;
3040  };
3041 
3042  return g_icvs;
3043 }
3044 
3045 static kmp_internal_control_t
3046 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3047 
3048  kmp_internal_control_t gx_icvs;
3049  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3050  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3051  gx_icvs.next = NULL;
3052 
3053  return gx_icvs;
3054 }
3055 
3056 static void
3057 __kmp_initialize_root( kmp_root_t *root )
3058 {
3059  int f;
3060  kmp_team_t *root_team;
3061  kmp_team_t *hot_team;
3062  int hot_team_max_nth;
3063  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3064  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3065  KMP_DEBUG_ASSERT( root );
3066  KMP_ASSERT( ! root->r.r_begin );
3067 
3068  /* setup the root state structure */
3069  __kmp_init_lock( &root->r.r_begin_lock );
3070  root->r.r_begin = FALSE;
3071  root->r.r_active = FALSE;
3072  root->r.r_in_parallel = 0;
3073  root->r.r_blocktime = __kmp_dflt_blocktime;
3074  root->r.r_nested = __kmp_dflt_nested;
3075 
3076  /* setup the root team for this task */
3077  /* allocate the root team structure */
3078  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3079 
3080  root_team =
3081  __kmp_allocate_team(
3082  root,
3083  1, // new_nproc
3084  1, // max_nproc
3085 #if OMPT_SUPPORT
3086  0, // root parallel id
3087 #endif
3088 #if OMP_40_ENABLED
3089  __kmp_nested_proc_bind.bind_types[0],
3090 #endif
3091  &r_icvs,
3092  0 // argc
3093  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3094  );
3095 #if USE_DEBUGGER
3096  // Non-NULL value should be assigned to make the debugger display the root team.
3097  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3098 #endif
3099 
3100  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3101 
3102  root->r.r_root_team = root_team;
3103  root_team->t.t_control_stack_top = NULL;
3104 
3105  /* initialize root team */
3106  root_team->t.t_threads[0] = NULL;
3107  root_team->t.t_nproc = 1;
3108  root_team->t.t_serialized = 1;
3109  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3110  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3111  root_team->t.t_sched.chunk = r_sched.chunk;
3112  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3113  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3114 
3115  /* setup the hot team for this task */
3116  /* allocate the hot team structure */
3117  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3118 
3119  hot_team =
3120  __kmp_allocate_team(
3121  root,
3122  1, // new_nproc
3123  __kmp_dflt_team_nth_ub * 2, // max_nproc
3124 #if OMPT_SUPPORT
3125  0, // root parallel id
3126 #endif
3127 #if OMP_40_ENABLED
3128  __kmp_nested_proc_bind.bind_types[0],
3129 #endif
3130  &r_icvs,
3131  0 // argc
3132  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3133  );
3134  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3135 
3136  root->r.r_hot_team = hot_team;
3137  root_team->t.t_control_stack_top = NULL;
3138 
3139  /* first-time initialization */
3140  hot_team->t.t_parent = root_team;
3141 
3142  /* initialize hot team */
3143  hot_team_max_nth = hot_team->t.t_max_nproc;
3144  for ( f = 0; f < hot_team_max_nth; ++ f ) {
3145  hot_team->t.t_threads[ f ] = NULL;
3146  }; // for
3147  hot_team->t.t_nproc = 1;
3148  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3149  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3150  hot_team->t.t_sched.chunk = r_sched.chunk;
3151  hot_team->t.t_size_changed = 0;
3152 }
3153 
3154 #ifdef KMP_DEBUG
3155 
3156 
3157 typedef struct kmp_team_list_item {
3158  kmp_team_p const * entry;
3159  struct kmp_team_list_item * next;
3160 } kmp_team_list_item_t;
3161 typedef kmp_team_list_item_t * kmp_team_list_t;
3162 
3163 
3164 static void
3165 __kmp_print_structure_team_accum( // Add team to list of teams.
3166  kmp_team_list_t list, // List of teams.
3167  kmp_team_p const * team // Team to add.
3168 ) {
3169 
3170  // List must terminate with item where both entry and next are NULL.
3171  // Team is added to the list only once.
3172  // List is sorted in ascending order by team id.
3173  // Team id is *not* a key.
3174 
3175  kmp_team_list_t l;
3176 
3177  KMP_DEBUG_ASSERT( list != NULL );
3178  if ( team == NULL ) {
3179  return;
3180  }; // if
3181 
3182  __kmp_print_structure_team_accum( list, team->t.t_parent );
3183  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3184 
3185  // Search list for the team.
3186  l = list;
3187  while ( l->next != NULL && l->entry != team ) {
3188  l = l->next;
3189  }; // while
3190  if ( l->next != NULL ) {
3191  return; // Team has been added before, exit.
3192  }; // if
3193 
3194  // Team is not found. Search list again for insertion point.
3195  l = list;
3196  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3197  l = l->next;
3198  }; // while
3199 
3200  // Insert team.
3201  {
3202  kmp_team_list_item_t * item =
3203  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3204  * item = * l;
3205  l->entry = team;
3206  l->next = item;
3207  }
3208 
3209 }
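/*
  Illustrative sketch (not part of the runtime): the accumulator above keeps a singly
  linked list that always ends in a sentinel node whose entry and next are both NULL,
  and it inserts a new element *before* node l by copying *l into a freshly allocated
  node and then overwriting l in place, which avoids carrying a "previous" pointer.
  The standalone example below demonstrates the same idiom with integers; the type and
  function names are invented for illustration, and the block is guarded with #if 0 so
  it does not affect the build.
*/
#if 0
#include <stdlib.h>

typedef struct int_list_item {
    int                   value;        /* ignored in the sentinel node          */
    int                   is_sentinel;  /* 1 only for the trailing sentinel      */
    struct int_list_item *next;         /* NULL only in the sentinel node        */
} int_list_item_t;

/* Insert value in ascending order, keeping the trailing sentinel in place. */
static void
int_list_insert( int_list_item_t *list, int value )
{
    int_list_item_t *l = list;
    int_list_item_t *item;
    while ( ! l->is_sentinel && l->value <= value ) {
        l = l->next;
    }
    /* Duplicate node l, then turn l itself into the inserted element. */
    item = (int_list_item_t *)malloc( sizeof( *item ) );
    *item = *l;
    l->value       = value;
    l->is_sentinel = 0;
    l->next        = item;
}
#endif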
3210 
3211 static void
3212 __kmp_print_structure_team(
3213  char const * title,
3214  kmp_team_p const * team
3215 
3216 ) {
3217  __kmp_printf( "%s", title );
3218  if ( team != NULL ) {
3219  __kmp_printf( "%2x %p\n", team->t.t_id, team );
3220  } else {
3221  __kmp_printf( " - (nil)\n" );
3222  }; // if
3223 }
3224 
3225 static void
3226 __kmp_print_structure_thread(
3227  char const * title,
3228  kmp_info_p const * thread
3229 
3230 ) {
3231  __kmp_printf( "%s", title );
3232  if ( thread != NULL ) {
3233  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3234  } else {
3235  __kmp_printf( " - (nil)\n" );
3236  }; // if
3237 }
3238 
3239 void
3240 __kmp_print_structure(
3241  void
3242 ) {
3243 
3244  kmp_team_list_t list;
3245 
3246  // Initialize list of teams.
3247  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3248  list->entry = NULL;
3249  list->next = NULL;
3250 
3251  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3252  {
3253  int gtid;
3254  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3255  __kmp_printf( "%2d", gtid );
3256  if ( __kmp_threads != NULL ) {
3257  __kmp_printf( " %p", __kmp_threads[ gtid ] );
3258  }; // if
3259  if ( __kmp_root != NULL ) {
3260  __kmp_printf( " %p", __kmp_root[ gtid ] );
3261  }; // if
3262  __kmp_printf( "\n" );
3263  }; // for gtid
3264  }
3265 
3266  // Print out __kmp_threads array.
3267  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3268  if ( __kmp_threads != NULL ) {
3269  int gtid;
3270  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3271  kmp_info_t const * thread = __kmp_threads[ gtid ];
3272  if ( thread != NULL ) {
3273  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3274  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
3275  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
3276  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
3277  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
3278  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
3279  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
3280  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
3281 #if OMP_40_ENABLED
3282  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
3283 #endif
3284  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
3285  __kmp_printf( "\n" );
3286  __kmp_print_structure_team_accum( list, thread->th.th_team );
3287  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3288  }; // if
3289  }; // for gtid
3290  } else {
3291  __kmp_printf( "Threads array is not allocated.\n" );
3292  }; // if
3293 
3294  // Print out __kmp_root array.
3295  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3296  if ( __kmp_root != NULL ) {
3297  int gtid;
3298  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3299  kmp_root_t const * root = __kmp_root[ gtid ];
3300  if ( root != NULL ) {
3301  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3302  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3303  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3304  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3305  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3306  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3307  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3308  __kmp_printf( "\n" );
3309  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3310  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3311  }; // if
3312  }; // for gtid
3313  } else {
3314  __kmp_printf( "Ubers array is not allocated.\n" );
3315  }; // if
3316 
3317  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3318  while ( list->next != NULL ) {
3319  kmp_team_p const * team = list->entry;
3320  int i;
3321  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3322  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3323  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3324  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3325  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3326  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3327  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3328  __kmp_printf( " Thread %2d: ", i );
3329  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3330  }; // for i
3331  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3332  __kmp_printf( "\n" );
3333  list = list->next;
3334  }; // while
3335 
3336  // Print out __kmp_thread_pool and __kmp_team_pool.
3337  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3338  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3339  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3340  __kmp_printf( "\n" );
3341 
3342  // Free team list.
3343  while ( list != NULL ) {
3344  kmp_team_list_item_t * item = list;
3345  list = list->next;
3346  KMP_INTERNAL_FREE( item );
3347  }; // while
3348 
3349 }
3350 
3351 #endif
3352 
3353 
3354 //---------------------------------------------------------------------------
3355 // Stuff for per-thread fast random number generator
3356 // Table of primes
3357 
3358 static const unsigned __kmp_primes[] = {
3359  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3360  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3361  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3362  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3363  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3364  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3365  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3366  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3367  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3368  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3369  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3370  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3371  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3372  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3373  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3374  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3375 };
3376 
3377 //---------------------------------------------------------------------------
3378 // __kmp_get_random: Get a random number using a linear congruential method.
3379 
3380 unsigned short
3381 __kmp_get_random( kmp_info_t * thread )
3382 {
3383  unsigned x = thread->th.th_x;
3384  unsigned short r = x>>16;
3385 
3386  thread->th.th_x = x*thread->th.th_a+1;
3387 
3388  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3389  thread->th.th_info.ds.ds_tid, r) );
3390 
3391  return r;
3392 }
3393 //--------------------------------------------------------
3394 // __kmp_init_random: Initialize a random number generator
3395 
3396 void
3397 __kmp_init_random( kmp_info_t * thread )
3398 {
3399  unsigned seed = thread->th.th_info.ds.ds_tid;
3400 
3401  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3402  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3403  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3404 }
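/*
  Illustrative sketch (not part of the runtime): the two routines above implement a
  per-thread linear congruential generator.  Each thread keeps a state x and a
  multiplier a drawn from __kmp_primes (indexed by its tid); every call returns the
  high 16 bits of x and then advances the state with x = x*a + 1.  The standalone
  example below models the same recurrence with plain unsigned arithmetic; the names
  are invented for illustration, and the block is guarded with #if 0 so it does not
  affect the build.
*/
#if 0
#include <stdio.h>

typedef struct { unsigned x, a; } lcg_t;

static void lcg_init( lcg_t *g, unsigned seed, unsigned prime ) {
    g->a = prime;                 /* per-thread multiplier, e.g. 0x9e3779b1        */
    g->x = (seed + 1) * g->a + 1; /* same seeding rule as __kmp_init_random above  */
}

static unsigned short lcg_next( lcg_t *g ) {
    unsigned short r = (unsigned short)(g->x >> 16); /* return the high 16 bits */
    g->x = g->x * g->a + 1;                          /* advance: x = x*a + 1    */
    return r;
}

int main( void ) {
    lcg_t g;
    int   i;
    lcg_init( &g, 0, 0x9e3779b1u );
    for ( i = 0; i < 4; ++i ) {
        printf( "%u\n", (unsigned)lcg_next( &g ) );
    }
    return 0;
}
#endif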
3405 
3406 
3407 #if KMP_OS_WINDOWS
3408 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3409 static int
3410 __kmp_reclaim_dead_roots(void) {
3411  int i, r = 0;
3412 
3413  for(i = 0; i < __kmp_threads_capacity; ++i) {
3414  if( KMP_UBER_GTID( i ) &&
3415  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3416  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3417  r += __kmp_unregister_root_other_thread(i);
3418  }
3419  }
3420  return r;
3421 }
3422 #endif
3423 
3424 /*
3425  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3426  free entries generated.
3427 
3428  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3429  already dead.
3430 
3431  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3432  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
3433  __kmp_tp_capacity, if threadprivate cache array has been created.
3434  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3435 
3436  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3437  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3438  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3439  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3440  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3441  as many free slots as possible up to nWish.
3442 
3443  If any argument is negative, the behavior is undefined.
3444 */
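/*
  Illustrative sketch (not part of the runtime): the capacity growth described above
  doubles __kmp_threads_capacity until it covers the requested number of free slots,
  clipping at the effective maximum (__kmp_tp_capacity if a threadprivate cache exists,
  otherwise __kmp_sys_max_nth).  The helper below reproduces just that arithmetic; the
  function name is invented for illustration, it assumes capacity >= 1 (as it always is
  by the time the runtime grows the arrays), and it is guarded with #if 0 so it does
  not affect the build.
*/
#if 0
/* Returns the new capacity, or the old one if the target cannot be reached. */
static int
example_grow_capacity( int capacity, int n_target, int actual_max )
{
    int required     = capacity + n_target;
    int new_capacity = capacity;
    if ( actual_max - capacity < n_target ) {
        return capacity;            /* not enough headroom to reach the target */
    }
    do {
        /* double, but never exceed the clipping value */
        new_capacity = ( new_capacity <= ( actual_max >> 1 ) )
                           ? ( new_capacity << 1 )
                           : actual_max;
    } while ( new_capacity < required );
    return new_capacity;
}
/* e.g. example_grow_capacity( 32, 5, 1024 ) == 64; with actual_max == 40 it returns 40. */
#endif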
3445 static int
3446 __kmp_expand_threads(int nWish, int nNeed) {
3447  int added = 0;
3448  int old_tp_cached;
3449  int __kmp_actual_max_nth;
3450 
3451  if(nNeed > nWish) /* normalize the arguments */
3452  nWish = nNeed;
3453 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3454 /* only for Windows static library */
3455  /* reclaim array entries for root threads that are already dead */
3456  added = __kmp_reclaim_dead_roots();
3457 
3458  if(nNeed) {
3459  nNeed -= added;
3460  if(nNeed < 0)
3461  nNeed = 0;
3462  }
3463  if(nWish) {
3464  nWish -= added;
3465  if(nWish < 0)
3466  nWish = 0;
3467  }
3468 #endif
3469  if(nWish <= 0)
3470  return added;
3471 
3472  while(1) {
3473  int nTarget;
3474  int minimumRequiredCapacity;
3475  int newCapacity;
3476  kmp_info_t **newThreads;
3477  kmp_root_t **newRoot;
3478 
3479  //
3480  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3481  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3482  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3483  // become > __kmp_max_nth in one of two ways:
3484  //
3485  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3486  // may not be reused by another thread, so we may need to increase
3487  // __kmp_threads_capacity to __kmp_max_nth + 1.
3488  //
3489  // 2) New foreign root(s) are encountered. We always register new
3490  // foreign roots. This may cause a smaller # of threads to be
3491  // allocated at subsequent parallel regions, but the worker threads
3492  // hang around (and eventually go to sleep) and need slots in the
3493  // __kmp_threads[] array.
3494  //
3495  // Anyway, that is the reason for moving the check to see if
3496  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3497  // instead of having it performed here. -BB
3498  //
3499  old_tp_cached = __kmp_tp_cached;
3500  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3501  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3502 
3503  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3504  nTarget = nWish;
3505  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3506  /* can't fulfil nWish, so try nNeed */
3507  if(nNeed) {
3508  nTarget = nNeed;
3509  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3510  /* possible expansion too small -- give up */
3511  break;
3512  }
3513  } else {
3514  /* best-effort */
3515  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3516  if(!nTarget) {
3517  /* can't expand at all -- give up */
3518  break;
3519  }
3520  }
3521  }
3522  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3523 
3524  newCapacity = __kmp_threads_capacity;
3525  do{
3526  newCapacity =
3527  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3528  (newCapacity << 1) :
3529  __kmp_actual_max_nth;
3530  } while(newCapacity < minimumRequiredCapacity);
3531  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3532  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3533  KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3534  KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3535  memset(newThreads + __kmp_threads_capacity, 0,
3536  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3537  memset(newRoot + __kmp_threads_capacity, 0,
3538  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3539 
3540  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3541  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3542  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3543  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3544  of a double-check pair.
3545  */
3546  __kmp_free(newThreads);
3547  continue; /* start over and try again */
3548  }
3549  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3550  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3551  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3552  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3553  __kmp_free(newThreads);
3554  continue; /* start over and try again */
3555  } else {
3556  /* success */
3557  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3558  //
3559  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3560  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3561  added += newCapacity - __kmp_threads_capacity;
3562  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3563  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3564  break; /* succeeded, so we can exit the loop */
3565  }
3566  }
3567  return added;
3568 }
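/*
  Illustrative sketch (not part of the runtime): the body of __kmp_expand_threads above
  uses an optimistic allocate / re-check / publish-or-retry pattern around
  __kmp_tp_cached_lock: the new arrays are built without the lock, the threadprivate
  cache condition is re-checked (first without, then with the lock), and the arrays are
  either published or freed and the loop restarted.  The standalone example below shows
  the same shape with a POSIX mutex and a plain flag; all names are invented for
  illustration, and the block is guarded with #if 0 so it does not affect the build.
*/
#if 0
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t example_lock = PTHREAD_MUTEX_INITIALIZER;
static int             example_cached;    /* stands in for __kmp_tp_cached   */
static int             example_capacity;  /* stands in for __kmp_tp_capacity */

static int *
example_publish( int **slot, int new_capacity )
{
    for ( ;; ) {
        int  old_cached = example_cached;              /* optimistic snapshot     */
        int *grown      = (int *)calloc( new_capacity, sizeof( int ) );

        if ( !old_cached && example_cached && new_capacity > example_capacity ) {
            free( grown );                             /* first (unlocked) check  */
            continue;                                  /* start over              */
        }
        pthread_mutex_lock( &example_lock );
        if ( !old_cached && example_cached && new_capacity > example_capacity ) {
            pthread_mutex_unlock( &example_lock );     /* second (locked) check   */
            free( grown );
            continue;                                  /* start over              */
        }
        *slot = grown;                                 /* publish under the lock  */
        pthread_mutex_unlock( &example_lock );
        return grown;
    }
}
#endif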
3569 
3570 /* register the current thread as a root thread and obtain our gtid */
3571 /* we must have the __kmp_initz_lock held at this point */
3572 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
3573 int
3574 __kmp_register_root( int initial_thread )
3575 {
3576  kmp_info_t *root_thread;
3577  kmp_root_t *root;
3578  int gtid;
3579  int capacity;
3580  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3581  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3582  KMP_MB();
3583 
3584 
3585  /*
3586  2007-03-02:
3587 
3588  If the initial thread has not invoked the OpenMP RTL yet, and this thread is not an initial one,
3589  the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3590  return false (meaning there is at least one empty slot in the __kmp_threads array), but it
3591  is possible that the only free slot is #0, which is reserved for the initial thread and so cannot be
3592  used for this one. The following code works around this bug.
3593 
3594  However, the right solution seems to be not reserving slot #0 for the initial thread, because:
3595  (1) there is no magic in slot #0,
3596  (2) we cannot detect the initial thread reliably (the first thread which does serial
3597  initialization may not be a real initial thread).
3598  */
3599  capacity = __kmp_threads_capacity;
3600  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3601  -- capacity;
3602  }; // if
3603 
3604  /* see if there are too many threads */
3605  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3606  if ( __kmp_tp_cached ) {
3607  __kmp_msg(
3608  kmp_ms_fatal,
3609  KMP_MSG( CantRegisterNewThread ),
3610  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3611  KMP_HNT( PossibleSystemLimitOnThreads ),
3612  __kmp_msg_null
3613  );
3614  }
3615  else {
3616  __kmp_msg(
3617  kmp_ms_fatal,
3618  KMP_MSG( CantRegisterNewThread ),
3619  KMP_HNT( SystemLimitOnThreads ),
3620  __kmp_msg_null
3621  );
3622  }
3623  }; // if
3624 
3625  /* find an available thread slot */
3626  /* Don't reassign the zero slot since we need that to only be used by initial
3627  thread */
3628  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3629  ;
3630  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3631  KMP_ASSERT( gtid < __kmp_threads_capacity );
3632 
3633  /* update global accounting */
3634  __kmp_all_nth ++;
3635  TCW_4(__kmp_nth, __kmp_nth + 1);
3636 
3637  //
3638  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3639  // for low numbers of procs, and method #2 (keyed API call) for higher
3640  // numbers of procs.
3641  //
3642  if ( __kmp_adjust_gtid_mode ) {
3643  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3644  if ( TCR_4(__kmp_gtid_mode) != 2) {
3645  TCW_4(__kmp_gtid_mode, 2);
3646  }
3647  }
3648  else {
3649  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3650  TCW_4(__kmp_gtid_mode, 1);
3651  }
3652  }
3653  }
3654 
3655 #ifdef KMP_ADJUST_BLOCKTIME
3656  /* Adjust blocktime to zero if necessary */
3657  /* Middle initialization might not have occurred yet */
3658  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3659  if ( __kmp_nth > __kmp_avail_proc ) {
3660  __kmp_zero_bt = TRUE;
3661  }
3662  }
3663 #endif /* KMP_ADJUST_BLOCKTIME */
3664 
3665  /* setup this new hierarchy */
3666  if( ! ( root = __kmp_root[gtid] )) {
3667  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3668  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3669  }
3670 
3671  __kmp_initialize_root( root );
3672 
3673  /* setup new root thread structure */
3674  if( root->r.r_uber_thread ) {
3675  root_thread = root->r.r_uber_thread;
3676  } else {
3677  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3678  if ( __kmp_storage_map ) {
3679  __kmp_print_thread_storage_map( root_thread, gtid );
3680  }
3681  root_thread->th.th_info .ds.ds_gtid = gtid;
3682  root_thread->th.th_root = root;
3683  if( __kmp_env_consistency_check ) {
3684  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3685  }
3686  #if USE_FAST_MEMORY
3687  __kmp_initialize_fast_memory( root_thread );
3688  #endif /* USE_FAST_MEMORY */
3689 
3690  #if KMP_USE_BGET
3691  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3692  __kmp_initialize_bget( root_thread );
3693  #endif
3694  __kmp_init_random( root_thread ); // Initialize random number generator
3695  }
3696 
3697  /* setup the serial team held in reserve by the root thread */
3698  if( ! root_thread->th.th_serial_team ) {
3699  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3700  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3701 
3702  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3703 #if OMPT_SUPPORT
3704  0, // root parallel id
3705 #endif
3706 #if OMP_40_ENABLED
3707  proc_bind_default,
3708 #endif
3709  &r_icvs,
3710  0 USE_NESTED_HOT_ARG(NULL) );
3711  }
3712  KMP_ASSERT( root_thread->th.th_serial_team );
3713  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3714  root_thread->th.th_serial_team ) );
3715 
3716  /* drop root_thread into place */
3717  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3718 
3719  root->r.r_root_team->t.t_threads[0] = root_thread;
3720  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3721  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3722  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3723  root->r.r_uber_thread = root_thread;
3724 
3725  /* initialize the thread, get it ready to go */
3726  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3727  TCW_4(__kmp_init_gtid, TRUE);
3728 
3729  /* prepare the master thread for get_gtid() */
3730  __kmp_gtid_set_specific( gtid );
3731 
3732 #if USE_ITT_BUILD
3733  __kmp_itt_thread_name( gtid );
3734 #endif /* USE_ITT_BUILD */
3735 
3736  #ifdef KMP_TDATA_GTID
3737  __kmp_gtid = gtid;
3738  #endif
3739  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3740  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3741 
3742  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3743  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3744  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3745  KMP_INIT_BARRIER_STATE ) );
3746  { // Initialize barrier data.
3747  int b;
3748  for ( b = 0; b < bs_last_barrier; ++ b ) {
3749  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3750 #if USE_DEBUGGER
3751  root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3752 #endif
3753  }; // for
3754  }
3755  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3756 
3757 #if KMP_AFFINITY_SUPPORTED
3758 # if OMP_40_ENABLED
3759  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3760  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3761  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3762  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3763 # endif
3764 
3765  if ( TCR_4(__kmp_init_middle) ) {
3766  __kmp_affinity_set_init_mask( gtid, TRUE );
3767  }
3768 #endif /* KMP_AFFINITY_SUPPORTED */
3769 
3770  __kmp_root_counter ++;
3771 
3772  KMP_MB();
3773  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3774 
3775  return gtid;
3776 }
3777 
3778 #if KMP_NESTED_HOT_TEAMS
3779 static int
3780 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3781 {
3782  int i, n, nth;
3783  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3784  if( !hot_teams || !hot_teams[level].hot_team ) {
3785  return 0;
3786  }
3787  KMP_DEBUG_ASSERT( level < max_level );
3788  kmp_team_t *team = hot_teams[level].hot_team;
3789  nth = hot_teams[level].hot_team_nth;
3790  n = nth - 1; // master is not freed
3791  if( level < max_level - 1 ) {
3792  for( i = 0; i < nth; ++i ) {
3793  kmp_info_t *th = team->t.t_threads[i];
3794  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3795  if( i > 0 && th->th.th_hot_teams ) {
3796  __kmp_free( th->th.th_hot_teams );
3797  th->th.th_hot_teams = NULL;
3798  }
3799  }
3800  }
3801  __kmp_free_team( root, team, NULL );
3802  return n;
3803 }
3804 #endif
3805 
3806 /* Resets a root thread and clears its root and hot teams.
3807  Returns the number of __kmp_threads entries directly and indirectly freed.
3808 */
3809 static int
3810 __kmp_reset_root(int gtid, kmp_root_t *root)
3811 {
3812  kmp_team_t * root_team = root->r.r_root_team;
3813  kmp_team_t * hot_team = root->r.r_hot_team;
3814  int n = hot_team->t.t_nproc;
3815  int i;
3816 
3817  KMP_DEBUG_ASSERT( ! root->r.r_active );
3818 
3819  root->r.r_root_team = NULL;
3820  root->r.r_hot_team = NULL;
3821  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
3822  // to __kmp_free_team().
3823  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3824 #if KMP_NESTED_HOT_TEAMS
3825  if( __kmp_hot_teams_max_level > 0 ) { // need to free nested hot teams and their threads if any
3826  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3827  kmp_info_t *th = hot_team->t.t_threads[i];
3828  if( __kmp_hot_teams_max_level > 1 ) {
3829  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3830  }
3831  if( th->th.th_hot_teams ) {
3832  __kmp_free( th->th.th_hot_teams );
3833  th->th.th_hot_teams = NULL;
3834  }
3835  }
3836  }
3837 #endif
3838  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3839 
3840  //
3841  // Before we can reap the thread, we need to make certain that all
3842  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3843  //
3844  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3845  __kmp_wait_to_unref_task_teams();
3846  }
3847 
3848  #if KMP_OS_WINDOWS
3849  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3850  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3851  (LPVOID)&(root->r.r_uber_thread->th),
3852  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3853  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3854  #endif /* KMP_OS_WINDOWS */
3855 
3856 #if OMPT_SUPPORT
3857  if (ompt_enabled &&
3858  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3859  int gtid = __kmp_get_gtid();
3860  __ompt_thread_end(ompt_thread_initial, gtid);
3861  }
3862 #endif
3863 
3864  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3865  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3866 
3867  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3868  root->r.r_uber_thread = NULL;
3869  /* mark root as no longer in use */
3870  root->r.r_begin = FALSE;
3871 
3872  return n;
3873 }
3874 
3875 void
3876 __kmp_unregister_root_current_thread( int gtid )
3877 {
3878  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3879  /* this lock should be ok, since unregister_root_current_thread is never called during
3880  * an abort, only during a normal close. Furthermore, if you have the
3881  * forkjoin lock, you should never try to get the initz lock */
3882 
3883  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3884  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3885  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3886  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3887  return;
3888  }
3889  kmp_root_t *root = __kmp_root[gtid];
3890 
3891  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3892  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3893  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3894  KMP_ASSERT( root->r.r_active == FALSE );
3895 
3896 
3897  KMP_MB();
3898 
3899 #if OMP_45_ENABLED
3900  kmp_info_t * thread = __kmp_threads[gtid];
3901  kmp_team_t * team = thread->th.th_team;
3902  kmp_task_team_t * task_team = thread->th.th_task_team;
3903 
3904  // we need to wait for the proxy tasks before finishing the thread
3905  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3906 #if OMPT_SUPPORT
3907  // the runtime is shutting down so we won't report any events
3908  thread->th.ompt_thread_info.state = ompt_state_undefined;
3909 #endif
3910  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3911  }
3912 #endif
3913 
3914  __kmp_reset_root(gtid, root);
3915 
3916  /* free up this thread slot */
3917  __kmp_gtid_set_specific( KMP_GTID_DNE );
3918 #ifdef KMP_TDATA_GTID
3919  __kmp_gtid = KMP_GTID_DNE;
3920 #endif
3921 
3922  KMP_MB();
3923  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3924 
3925  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3926 }
3927 
3928 #if KMP_OS_WINDOWS
3929 /* __kmp_forkjoin_lock must already be held
3930  Unregisters a root thread that is not the current thread. Returns the number of
3931  __kmp_threads entries freed as a result.
3932  */
3933 static int
3934 __kmp_unregister_root_other_thread( int gtid )
3935 {
3936  kmp_root_t *root = __kmp_root[gtid];
3937  int r;
3938 
3939  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3940  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3941  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3942  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3943  KMP_ASSERT( root->r.r_active == FALSE );
3944 
3945  r = __kmp_reset_root(gtid, root);
3946  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3947  return r;
3948 }
3949 #endif
3950 
3951 #if KMP_DEBUG
3952 void __kmp_task_info() {
3953 
3954  kmp_int32 gtid = __kmp_entry_gtid();
3955  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
3956  kmp_info_t *this_thr = __kmp_threads[ gtid ];
3957  kmp_team_t *steam = this_thr->th.th_serial_team;
3958  kmp_team_t *team = this_thr->th.th_team;
3959 
3960  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3961  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3962 }
3963 #endif // KMP_DEBUG
3964 
3965 /* TODO optimize with one big memclr, take out what isn't needed,
3966  * split responsibility to workers as much as possible, and delay
3967  * initialization of features as much as possible */
3968 static void
3969 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3970 {
3971  /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
3972  * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3973  kmp_info_t *master = team->t.t_threads[0];
3974  KMP_DEBUG_ASSERT( this_thr != NULL );
3975  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3976  KMP_DEBUG_ASSERT( team );
3977  KMP_DEBUG_ASSERT( team->t.t_threads );
3978  KMP_DEBUG_ASSERT( team->t.t_dispatch );
3979  KMP_DEBUG_ASSERT( master );
3980  KMP_DEBUG_ASSERT( master->th.th_root );
3981 
3982  KMP_MB();
3983 
3984  TCW_SYNC_PTR(this_thr->th.th_team, team);
3985 
3986  this_thr->th.th_info.ds.ds_tid = tid;
3987  this_thr->th.th_set_nproc = 0;
3988 #if OMP_40_ENABLED
3989  this_thr->th.th_set_proc_bind = proc_bind_default;
3990 # if KMP_AFFINITY_SUPPORTED
3991  this_thr->th.th_new_place = this_thr->th.th_current_place;
3992 # endif
3993 #endif
3994  this_thr->th.th_root = master->th.th_root;
3995 
3996  /* setup the thread's cache of the team structure */
3997  this_thr->th.th_team_nproc = team->t.t_nproc;
3998  this_thr->th.th_team_master = master;
3999  this_thr->th.th_team_serialized = team->t.t_serialized;
4000  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4001 
4002  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4003 
4004  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4005  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4006 
4007  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4008 
4009  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4010  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4011  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4012 
4013  /* TODO no worksharing in speculative threads */
4014  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
4015 
4016  this_thr->th.th_local.this_construct = 0;
4017 
4018 #ifdef BUILD_TV
4019  this_thr->th.th_local.tv_data = 0;
4020 #endif
4021 
4022  if ( ! this_thr->th.th_pri_common ) {
4023  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4024  if ( __kmp_storage_map ) {
4025  __kmp_print_storage_map_gtid(
4026  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4027  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4028  );
4029  }; // if
4030  this_thr->th.th_pri_head = NULL;
4031  }; // if
4032 
4033  /* Initialize dynamic dispatch */
4034  {
4035  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4036  /*
4037  * Use team max_nproc since this will never change for the team.
4038  */
4039  size_t disp_size = sizeof( dispatch_private_info_t ) *
4040  ( team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers );
4041  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4042  KMP_ASSERT( dispatch );
4043  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4044  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4045 
4046  dispatch->th_disp_index = 0;
4047 #if OMP_45_ENABLED
4048  dispatch->th_doacross_buf_idx = 0;
4049 #endif
4050  if( ! dispatch->th_disp_buffer ) {
4051  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4052 
4053  if ( __kmp_storage_map ) {
4054  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4055  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers ],
4056  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4057  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4058  gtid, team->t.t_id, gtid );
4059  }
4060  } else {
4061  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4062  }
4063 
4064  dispatch->th_dispatch_pr_current = 0;
4065  dispatch->th_dispatch_sh_current = 0;
4066 
4067  dispatch->th_deo_fcn = 0; /* ORDERED */
4068  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4069  }
4070 
4071  this_thr->th.th_next_pool = NULL;
4072 
4073  if (!this_thr->th.th_task_state_memo_stack) {
4074  size_t i;
4075  this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4076  this_thr->th.th_task_state_top = 0;
4077  this_thr->th.th_task_state_stack_sz = 4;
4078  for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4079  this_thr->th.th_task_state_memo_stack[i] = 0;
4080  }
4081 
4082  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4083  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4084 
4085  KMP_MB();
4086 }
4087 
4088 
4089 /* allocate a new thread for the requesting team. this is only called from within a
4090  * forkjoin critical section. we will first try to get an available thread from the
4091  * thread pool. if none is available, we will fork a new one assuming we are able
4092  * to create a new one. this should be assured, as the caller should check on this
4093  * first.
4094  */
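/*
  Illustrative sketch (not part of the runtime): the allocation strategy described in
  the comment above is "pop a thread from the LIFO pool if one is available, otherwise
  create a new one", with the reverse push when a thread is returned to the pool.  The
  standalone example below shows that shape with a trivial record type; the names and
  the creation logic are invented for illustration, and the block is guarded with
  #if 0 so it does not affect the build.
*/
#if 0
#include <stdlib.h>

typedef struct example_worker {
    int                    id;
    struct example_worker *next_pool;   /* link used only while parked in the pool */
} example_worker_t;

static example_worker_t *example_pool;  /* head of the LIFO free pool */
static int               example_next_id;

static example_worker_t *
example_acquire_worker( void )
{
    example_worker_t *w;
    if ( example_pool != NULL ) {        /* first, try to reuse a pooled worker */
        w = example_pool;
        example_pool = w->next_pool;
        w->next_pool = NULL;
        return w;
    }
    /* none available: create ("fork") a fresh one */
    w = (example_worker_t *)malloc( sizeof( *w ) );
    w->id = example_next_id++;
    w->next_pool = NULL;
    return w;
}

static void
example_release_worker( example_worker_t *w )
{
    w->next_pool = example_pool;         /* push back onto the pool head */
    example_pool = w;
}
#endif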
4095 kmp_info_t *
4096 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4097 {
4098  kmp_team_t *serial_team;
4099  kmp_info_t *new_thr;
4100  int new_gtid;
4101 
4102  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4103  KMP_DEBUG_ASSERT( root && team );
4104 #if !KMP_NESTED_HOT_TEAMS
4105  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4106 #endif
4107  KMP_MB();
4108 
4109  /* first, try to get one from the thread pool */
4110  if ( __kmp_thread_pool ) {
4111 
4112  new_thr = (kmp_info_t*)__kmp_thread_pool;
4113  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4114  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4115  __kmp_thread_pool_insert_pt = NULL;
4116  }
4117  TCW_4(new_thr->th.th_in_pool, FALSE);
4118  //
4119  // Don't touch th_active_in_pool or th_active.
4120  // The worker thread adjusts those flags as it sleeps/awakens.
4121  //
4122  __kmp_thread_pool_nth--;
4123 
4124  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4125  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4126  KMP_ASSERT( ! new_thr->th.th_team );
4127  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4128  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4129 
4130  /* setup the thread structure */
4131  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4132  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4133 
4134  TCW_4(__kmp_nth, __kmp_nth + 1);
4135 
4136  new_thr->th.th_task_state = 0;
4137  new_thr->th.th_task_state_top = 0;
4138  new_thr->th.th_task_state_stack_sz = 4;
4139 
4140 #ifdef KMP_ADJUST_BLOCKTIME
4141  /* Adjust blocktime back to zero if necessary */
4142  /* Middle initialization might not have occurred yet */
4143  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4144  if ( __kmp_nth > __kmp_avail_proc ) {
4145  __kmp_zero_bt = TRUE;
4146  }
4147  }
4148 #endif /* KMP_ADJUST_BLOCKTIME */
4149 
4150 #if KMP_DEBUG
4151  // If the thread entered the pool via __kmp_free_thread, wait_flag should not be KMP_BARRIER_PARENT_FLAG.
4152  int b;
4153  kmp_balign_t * balign = new_thr->th.th_bar;
4154  for( b = 0; b < bs_last_barrier; ++ b )
4155  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4156 #endif
4157 
4158  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4159  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4160 
4161  KMP_MB();
4162  return new_thr;
4163  }
4164 
4165 
4166  /* no, we'll fork a new one */
4167  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
4168  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4169 
4170  //
4171  // If this is the first worker thread the RTL is creating, then also
4172  // launch the monitor thread. We try to do this as early as possible.
4173  //
4174  if ( ! TCR_4( __kmp_init_monitor ) ) {
4175  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4176  if ( ! TCR_4( __kmp_init_monitor ) ) {
4177  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4178  TCW_4( __kmp_init_monitor, 1 );
4179  __kmp_create_monitor( & __kmp_monitor );
4180  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4181  #if KMP_OS_WINDOWS
4182  // AC: wait until monitor has started. This is a fix for CQ232808.
4183  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
4184  // work in between, then there is a high probability that the monitor thread will start after
4185  // the library shutdown. At shutdown it is too late to cope with the problem, because
4186  // when the master is in DllMain (process detach) the monitor has no chance to start
4187  // (it is blocked), and the master has no means to inform the monitor that the library is gone,
4188  // because all the memory which the monitor can access is going to be released/reset.
4189  while ( TCR_4(__kmp_init_monitor) < 2 ) {
4190  KMP_YIELD( TRUE );
4191  }
4192  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4193  #endif
4194  }
4195  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4196  }
4197 
4198  KMP_MB();
4199  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4200  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4201  }
4202 
4203  /* allocate space for it. */
4204  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4205 
4206  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4207 
4208  if ( __kmp_storage_map ) {
4209  __kmp_print_thread_storage_map( new_thr, new_gtid );
4210  }
4211 
4212  /* add the reserve serialized team, initialized from the team's master thread */
4213  {
4214  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4215  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4216 
4217  new_thr->th.th_serial_team = serial_team =
4218  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4219 #if OMPT_SUPPORT
4220  0, // root parallel id
4221 #endif
4222 #if OMP_40_ENABLED
4223  proc_bind_default,
4224 #endif
4225  &r_icvs,
4226  0 USE_NESTED_HOT_ARG(NULL) );
4227  }
4228  KMP_ASSERT ( serial_team );
4229  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
4230  serial_team->t.t_threads[0] = new_thr;
4231  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4232  new_thr ) );
4233 
4234  /* setup the thread structures */
4235  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4236 
4237  #if USE_FAST_MEMORY
4238  __kmp_initialize_fast_memory( new_thr );
4239  #endif /* USE_FAST_MEMORY */
4240 
4241  #if KMP_USE_BGET
4242  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4243  __kmp_initialize_bget( new_thr );
4244  #endif
4245 
4246  __kmp_init_random( new_thr ); // Initialize random number generator
4247 
4248  /* Initialize these only once when thread is grabbed for a team allocation */
4249  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4250  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4251 
4252  int b;
4253  kmp_balign_t * balign = new_thr->th.th_bar;
4254  for(b=0; b<bs_last_barrier; ++b) {
4255  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4256  balign[b].bb.team = NULL;
4257  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4258  balign[b].bb.use_oncore_barrier = 0;
4259  }
4260 
4261  new_thr->th.th_spin_here = FALSE;
4262  new_thr->th.th_next_waiting = 0;
4263 
4264 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4265  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4266  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4267  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4268  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4269 #endif
4270 
4271  TCW_4(new_thr->th.th_in_pool, FALSE);
4272  new_thr->th.th_active_in_pool = FALSE;
4273  TCW_4(new_thr->th.th_active, TRUE);
4274 
4275  /* adjust the global counters */
4276  __kmp_all_nth ++;
4277  __kmp_nth ++;
4278 
4279  //
4280  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4281  // for low numbers of procs, and method #2 (keyed API call) for higher
4282  // numbers of procs.
4283  //
4284  if ( __kmp_adjust_gtid_mode ) {
4285  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4286  if ( TCR_4(__kmp_gtid_mode) != 2) {
4287  TCW_4(__kmp_gtid_mode, 2);
4288  }
4289  }
4290  else {
4291  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4292  TCW_4(__kmp_gtid_mode, 1);
4293  }
4294  }
4295  }
4296 
4297 #ifdef KMP_ADJUST_BLOCKTIME
4298  /* Adjust blocktime back to zero if necessary */
4299  /* Middle initialization might not have occurred yet */
4300  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4301  if ( __kmp_nth > __kmp_avail_proc ) {
4302  __kmp_zero_bt = TRUE;
4303  }
4304  }
4305 #endif /* KMP_ADJUST_BLOCKTIME */
4306 
4307  /* actually fork it and create the new worker thread */
4308  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4309  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4310  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4311 
4312  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4313  KMP_MB();
4314  return new_thr;
4315 }
4316 
4317 /*
4318  * reinitialize team for reuse.
4319  *
4320  * The hot team code calls this routine at every fork barrier, so EPCC barrier
4321  * tests are extremely sensitive to changes in it, esp. writes to the team
4322  * struct, which cause a cache invalidation in all threads.
4323  *
4324  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4325  */
4326 static void
4327 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4328  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4329  team->t.t_threads[0], team ) );
4330  KMP_DEBUG_ASSERT( team && new_icvs);
4331  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4332  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4333 
4334  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4335 
4336  // Copy ICVs to the master thread's implicit taskdata
4337  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4338  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4339 
4340  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4341  team->t.t_threads[0], team ) );
4342 }
4343 
4344 
4345 /* initialize the team data structure
4346  * this assumes the t_threads and t_max_nproc are already set
4347  * also, we don't touch the arguments */
4348 static void
4349 __kmp_initialize_team(
4350  kmp_team_t * team,
4351  int new_nproc,
4352  kmp_internal_control_t * new_icvs,
4353  ident_t * loc
4354 ) {
4355  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4356 
4357  /* verify */
4358  KMP_DEBUG_ASSERT( team );
4359  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4360  KMP_DEBUG_ASSERT( team->t.t_threads );
4361  KMP_MB();
4362 
4363  team->t.t_master_tid = 0; /* not needed */
4364  /* team->t.t_master_bar; not needed */
4365  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4366  team->t.t_nproc = new_nproc;
4367 
4368  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4369  team->t.t_next_pool = NULL;
4370  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4371 
4372  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4373  team->t.t_invoke = NULL; /* not needed */
4374 
4375  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4376  team->t.t_sched = new_icvs->sched;
4377 
4378 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4379  team->t.t_fp_control_saved = FALSE; /* not needed */
4380  team->t.t_x87_fpu_control_word = 0; /* not needed */
4381  team->t.t_mxcsr = 0; /* not needed */
4382 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4383 
4384  team->t.t_construct = 0;
4385  __kmp_init_lock( & team->t.t_single_lock );
4386 
4387  team->t.t_ordered .dt.t_value = 0;
4388  team->t.t_master_active = FALSE;
4389 
4390  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4391 
4392 #ifdef KMP_DEBUG
4393  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4394 #endif
4395  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4396 
4397  team->t.t_control_stack_top = NULL;
4398 
4399  __kmp_reinitialize_team( team, new_icvs, loc );
4400 
4401  KMP_MB();
4402  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4403 }
4404 
4405 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4406 /* Sets full mask for thread and returns old mask, no changes to structures. */
4407 static void
4408 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4409 {
4410  if ( KMP_AFFINITY_CAPABLE() ) {
4411  int status;
4412  if ( old_mask != NULL ) {
4413  status = __kmp_get_system_affinity( old_mask, TRUE );
4414  int error = errno;
4415  if ( status != 0 ) {
4416  __kmp_msg(
4417  kmp_ms_fatal,
4418  KMP_MSG( ChangeThreadAffMaskError ),
4419  KMP_ERR( error ),
4420  __kmp_msg_null
4421  );
4422  }
4423  }
4424  __kmp_set_system_affinity( __kmp_affin_fullMask, TRUE );
4425  }
4426 }
4427 #endif
4428 
4429 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4430 
4431 //
4432 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4433 // It calculates the worker + master thread's partition based upon the parent
4434 // thread's partition, and binds each worker to a thread in their partition.
4435 // The master thread's partition should already include its current binding.
4436 //
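//
// Illustrative sketch (not part of the runtime): when there are more threads than
// places (the n_th > n_places branches below), the partitioning spreads threads as
// evenly as possible: every place receives S = n_th / n_places threads, and
// rem = n_th - S*n_places places, spaced roughly gap = n_places / rem apart, receive
// one extra. The helper below computes just those per-place counts; the names are
// invented for illustration, the ring traversal is simplified to a modulo, and the
// block is guarded with #if 0 so it does not affect the build.
//
#if 0
/* Fill counts[0..n_places-1] with the number of threads assigned to each place,
   starting from the master's place, mirroring the S/rem/gap arithmetic of the
   n_th > n_places case below. */
static void
example_partition_counts( int n_th, int n_places, int masters_place, int *counts )
{
    int S      = n_th / n_places;            /* base threads per place        */
    int rem    = n_th - S * n_places;        /* places that get one extra     */
    int gap    = rem > 0 ? n_places / rem : n_places;
    int gap_ct = gap;
    int place  = masters_place;
    int i;

    for ( i = 0; i < n_places; ++i ) {
        counts[ i ] = 0;
    }
    for ( i = 0; i < n_places; ++i ) {
        counts[ place ] = S;
        if ( rem && gap_ct == gap ) {        /* this place gets an extra thread */
            counts[ place ] += 1;
            rem--;
            gap_ct = 1;
        } else {
            gap_ct++;
        }
        place = ( place + 1 ) % n_places;    /* simplified ring traversal       */
    }
}
/* e.g. n_th = 10, n_places = 4 gives per-place counts of 3, 2, 3, 2 in visit order,
   starting at the master's place. */
#endif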
4437 static void
4438 __kmp_partition_places( kmp_team_t *team, int update_master_only )
4439 {
4440  //
4441  // Copy the master thread's place partion to the team struct
4442  // Copy the master thread's place partition to the team struct
4443  kmp_info_t *master_th = team->t.t_threads[0];
4444  KMP_DEBUG_ASSERT( master_th != NULL );
4445  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4446  int first_place = master_th->th.th_first_place;
4447  int last_place = master_th->th.th_last_place;
4448  int masters_place = master_th->th.th_current_place;
4449  team->t.t_first_place = first_place;
4450  team->t.t_last_place = last_place;
4451 
4452  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4453  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4454  masters_place, first_place, last_place ) );
4455 
4456  switch ( proc_bind ) {
4457 
4458  case proc_bind_default:
4459  //
4460  // serial teams might have the proc_bind policy set to
4461  // proc_bind_default. It doesn't matter, as we don't
4462  // rebind the master thread for any proc_bind policy.
4463  //
4464  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4465  break;
4466 
4467  case proc_bind_master:
4468  {
4469  int f;
4470  int n_th = team->t.t_nproc;
4471  for ( f = 1; f < n_th; f++ ) {
4472  kmp_info_t *th = team->t.t_threads[f];
4473  KMP_DEBUG_ASSERT( th != NULL );
4474  th->th.th_first_place = first_place;
4475  th->th.th_last_place = last_place;
4476  th->th.th_new_place = masters_place;
4477 
4478  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4479  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4480  team->t.t_id, f, masters_place, first_place, last_place ) );
4481  }
4482  }
4483  break;
4484 
4485  case proc_bind_close:
4486  {
4487  int f;
4488  int n_th = team->t.t_nproc;
4489  int n_places;
4490  if ( first_place <= last_place ) {
4491  n_places = last_place - first_place + 1;
4492  }
4493  else {
4494  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4495  }
4496  if ( n_th <= n_places ) {
4497  int place = masters_place;
4498  for ( f = 1; f < n_th; f++ ) {
4499  kmp_info_t *th = team->t.t_threads[f];
4500  KMP_DEBUG_ASSERT( th != NULL );
4501 
4502  if ( place == last_place ) {
4503  place = first_place;
4504  }
4505  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4506  place = 0;
4507  }
4508  else {
4509  place++;
4510  }
4511  th->th.th_first_place = first_place;
4512  th->th.th_last_place = last_place;
4513  th->th.th_new_place = place;
4514 
4515  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4516  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4517  team->t.t_id, f, place, first_place, last_place ) );
4518  }
4519  }
4520  else {
4521  int S, rem, gap, s_count;
4522  S = n_th / n_places;
4523  s_count = 0;
4524  rem = n_th - ( S * n_places );
4525  gap = rem > 0 ? n_places/rem : n_places;
4526  int place = masters_place;
4527  int gap_ct = gap;
4528  for ( f = 0; f < n_th; f++ ) {
4529  kmp_info_t *th = team->t.t_threads[f];
4530  KMP_DEBUG_ASSERT( th != NULL );
4531 
4532  th->th.th_first_place = first_place;
4533  th->th.th_last_place = last_place;
4534  th->th.th_new_place = place;
4535  s_count++;
4536 
4537  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4538  // do nothing, add an extra thread to place on next iteration
4539  }
4540  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4541  // we added an extra thread to this place; move to next place
4542  if ( place == last_place ) {
4543  place = first_place;
4544  }
4545  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4546  place = 0;
4547  }
4548  else {
4549  place++;
4550  }
4551  s_count = 0;
4552  gap_ct = 1;
4553  rem--;
4554  }
4555  else if (s_count == S) { // place full; don't add extra
4556  if ( place == last_place ) {
4557  place = first_place;
4558  }
4559  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4560  place = 0;
4561  }
4562  else {
4563  place++;
4564  }
4565  gap_ct++;
4566  s_count = 0;
4567  }
4568 
4569  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4570  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4571  team->t.t_id, f, th->th.th_new_place, first_place,
4572  last_place ) );
4573  }
4574  KMP_DEBUG_ASSERT( place == masters_place );
4575  }
4576  }
4577  break;
4578 
4579  case proc_bind_spread:
4580  {
4581  int f;
4582  int n_th = team->t.t_nproc;
4583  int n_places;
4584  int thidx;
4585  if ( first_place <= last_place ) {
4586  n_places = last_place - first_place + 1;
4587  }
4588  else {
4589  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4590  }
4591  if ( n_th <= n_places ) {
4592  int place = masters_place;
4593  int S = n_places/n_th;
4594  int s_count, rem, gap, gap_ct;
4595  rem = n_places - n_th*S;
4596  gap = rem ? n_th/rem : 1;
4597  gap_ct = gap;
4598  thidx = n_th;
4599  if (update_master_only == 1)
4600  thidx = 1;
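 // Partitioning sketch (illustrative example values): each thread owns a contiguous
 // sub-partition of S == n_places/n_th places, and roughly every gap-th thread gets
 // one extra place until rem extras are handed out. E.g. n_th == 3 and n_places == 8
 // give S == 2, rem == 2, gap == 1, so the threads own 3, 3 and 2 places.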
4601  for ( f = 0; f < thidx; f++ ) {
4602  kmp_info_t *th = team->t.t_threads[f];
4603  KMP_DEBUG_ASSERT( th != NULL );
4604 
4605  th->th.th_first_place = place;
4606  th->th.th_new_place = place;
4607  s_count = 1;
4608  while (s_count < S) {
4609  if ( place == last_place ) {
4610  place = first_place;
4611  }
4612  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4613  place = 0;
4614  }
4615  else {
4616  place++;
4617  }
4618  s_count++;
4619  }
4620  if (rem && (gap_ct == gap)) {
4621  if ( place == last_place ) {
4622  place = first_place;
4623  }
4624  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4625  place = 0;
4626  }
4627  else {
4628  place++;
4629  }
4630  rem--;
4631  gap_ct = 0;
4632  }
4633  th->th.th_last_place = place;
4634  gap_ct++;
4635 
4636  if ( place == last_place ) {
4637  place = first_place;
4638  }
4639  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4640  place = 0;
4641  }
4642  else {
4643  place++;
4644  }
4645 
4646  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4647  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4648  team->t.t_id, f, th->th.th_new_place,
4649  th->th.th_first_place, th->th.th_last_place ) );
4650  }
4651  KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4652  }
4653  else {
4654  int S, rem, gap, s_count;
4655  S = n_th / n_places;
4656  s_count = 0;
4657  rem = n_th - ( S * n_places );
4658  gap = rem > 0 ? n_places/rem : n_places;
4659  int place = masters_place;
4660  int gap_ct = gap;
4661  thidx = n_th;
4662  if (update_master_only == 1)
4663  thidx = 1;
4664  for ( f = 0; f < thidx; f++ ) {
4665  kmp_info_t *th = team->t.t_threads[f];
4666  KMP_DEBUG_ASSERT( th != NULL );
4667 
4668  th->th.th_first_place = place;
4669  th->th.th_last_place = place;
4670  th->th.th_new_place = place;
4671  s_count++;
4672 
4673  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4674  // do nothing, add an extra thread to place on next iteration
4675  }
4676  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4677  // we added an extra thread to this place; move on to next place
4678  if ( place == last_place ) {
4679  place = first_place;
4680  }
4681  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4682  place = 0;
4683  }
4684  else {
4685  place++;
4686  }
4687  s_count = 0;
4688  gap_ct = 1;
4689  rem--;
4690  }
4691  else if (s_count == S) { // place is full; don't add extra thread
4692  if ( place == last_place ) {
4693  place = first_place;
4694  }
4695  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4696  place = 0;
4697  }
4698  else {
4699  place++;
4700  }
4701  gap_ct++;
4702  s_count = 0;
4703  }
4704 
4705  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4706  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4707  team->t.t_id, f, th->th.th_new_place,
4708  th->th.th_first_place, th->th.th_last_place) );
4709  }
4710  KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4711  }
4712  }
4713  break;
4714 
4715  default:
4716  break;
4717  }
4718 
4719  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4720 }
4721 
4722 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4723 
4724 /* allocate a new team data structure to use. take one off of the free pool if available */
4725 kmp_team_t *
4726 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4727 #if OMPT_SUPPORT
4728  ompt_parallel_id_t ompt_parallel_id,
4729 #endif
4730 #if OMP_40_ENABLED
4731  kmp_proc_bind_t new_proc_bind,
4732 #endif
4733  kmp_internal_control_t *new_icvs,
4734  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4735 {
4736  KMP_TIME_DEVELOPER_BLOCK(KMP_allocate_team);
4737  int f;
4738  kmp_team_t *team;
4739  int use_hot_team = ! root->r.r_active;
4740  int level = 0;
4741 
4742  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4743  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4744  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4745  KMP_MB();
4746 
4747 #if KMP_NESTED_HOT_TEAMS
4748  kmp_hot_team_ptr_t *hot_teams;
4749  if( master ) {
4750  team = master->th.th_team;
4751  level = team->t.t_active_level;
4752  if( master->th.th_teams_microtask ) { // in teams construct?
4753  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4754  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4755  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4756  ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4757  }
4758  }
4759  hot_teams = master->th.th_hot_teams;
4760  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4761  { // hot team has already been allocated for given level
4762  use_hot_team = 1;
4763  } else {
4764  use_hot_team = 0;
4765  }
4766  }
4767 #endif
4768  // Optimization to use a "hot" team
4769  if( use_hot_team && new_nproc > 1 ) {
4770  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4771 #if KMP_NESTED_HOT_TEAMS
4772  team = hot_teams[level].hot_team;
4773 #else
4774  team = root->r.r_hot_team;
4775 #endif
4776 #if KMP_DEBUG
4777  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4778  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4779  team->t.t_task_team[0], team->t.t_task_team[1] ));
4780  }
4781 #endif
4782 
4783  // Has the number of threads changed?
4784  /* Let's assume the most common case is that the number of threads is unchanged, and
4785  put that case first. */
4786  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4787  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4788  // This case can mean that omp_set_num_threads() was called and the hot team size
4789  // was already reduced, so we check the special flag
4790  if ( team->t.t_size_changed == -1 ) {
4791  team->t.t_size_changed = 1;
4792  } else {
4793  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4794  }
4795 
4796  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4797  kmp_r_sched_t new_sched = new_icvs->sched;
4798  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4799  team->t.t_sched.chunk != new_sched.chunk)
4800  team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
4801 
4802  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4803 
4804  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4805  0, team->t.t_threads[0], team ) );
4806  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4807 
4808 #if OMP_40_ENABLED
4809 # if KMP_AFFINITY_SUPPORTED
4810  if ( ( team->t.t_size_changed == 0 )
4811  && ( team->t.t_proc_bind == new_proc_bind ) ) {
4812  if (new_proc_bind == proc_bind_spread) {
4813  __kmp_partition_places(team, 1); // add flag to update only master for spread
4814  }
4815  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4816  team->t.t_id, new_proc_bind, team->t.t_first_place,
4817  team->t.t_last_place ) );
4818  }
4819  else {
4820  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4821  __kmp_partition_places( team );
4822  }
4823 # else
4824  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4825 # endif /* KMP_AFFINITY_SUPPORTED */
4826 #endif /* OMP_40_ENABLED */
4827  }
4828  else if( team->t.t_nproc > new_nproc ) {
4829  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4830 
4831  team->t.t_size_changed = 1;
4832 #if KMP_NESTED_HOT_TEAMS
4833  if( __kmp_hot_teams_mode == 0 ) {
4834  // AC: in this mode the saved number of threads should match the team's value;
4835  // it can be bigger in mode 1, when the hot team keeps some threads in reserve
4836  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4837  hot_teams[level].hot_team_nth = new_nproc;
4838 #endif // KMP_NESTED_HOT_TEAMS
4839  /* release the extra threads we don't need any more */
4840  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4841  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4842  if ( __kmp_tasking_mode != tskm_immediate_exec) {
4843  // When decreasing team size, threads no longer in the team should unref task team.
4844  team->t.t_threads[f]->th.th_task_team = NULL;
4845  }
4846  __kmp_free_thread( team->t.t_threads[ f ] );
4847  team->t.t_threads[ f ] = NULL;
4848  }
4849 #if KMP_NESTED_HOT_TEAMS
4850  } // (__kmp_hot_teams_mode == 0)
4851 #endif // KMP_NESTED_HOT_TEAMS
4852  team->t.t_nproc = new_nproc;
4853  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4854  if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4855  team->t.t_sched.chunk != new_icvs->sched.chunk)
4856  team->t.t_sched = new_icvs->sched;
4857  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4858 
4859  /* update the remaining threads */
4860  for(f = 0; f < new_nproc; ++f) {
4861  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4862  }
4863  // restore the current task state of the master thread: should be the implicit task
4864  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4865  0, team->t.t_threads[0], team ) );
4866 
4867  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4868 
4869 #ifdef KMP_DEBUG
4870  for ( f = 0; f < team->t.t_nproc; f++ ) {
4871  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4872  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4873  }
4874 #endif
4875 
4876 #if OMP_40_ENABLED
4877  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4878 # if KMP_AFFINITY_SUPPORTED
4879  __kmp_partition_places( team );
4880 # endif
4881 #endif
4882  }
4883  else { // team->t.t_nproc < new_nproc
4884 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4885  kmp_affin_mask_t *old_mask;
4886  if ( KMP_AFFINITY_CAPABLE() ) {
4887  KMP_CPU_ALLOC(old_mask);
4888  }
4889 #endif
4890 
4891  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4892 
4893  team->t.t_size_changed = 1;
4894 
4895 #if KMP_NESTED_HOT_TEAMS
4896  int avail_threads = hot_teams[level].hot_team_nth;
4897  if( new_nproc < avail_threads )
4898  avail_threads = new_nproc;
4899  kmp_info_t **other_threads = team->t.t_threads;
4900  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4901  // Adjust barrier data of reserved threads (if any) of the team
4902  // Other data will be set in __kmp_initialize_info() below.
4903  int b;
4904  kmp_balign_t * balign = other_threads[f]->th.th_bar;
4905  for ( b = 0; b < bs_last_barrier; ++ b ) {
4906  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4907  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4908 #if USE_DEBUGGER
4909  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4910 #endif
4911  }
4912  }
4913  if( hot_teams[level].hot_team_nth >= new_nproc ) {
4914  // we have all the needed threads in reserve, no need to allocate any;
4915  // this is only possible in mode 1, as there cannot be reserved threads in mode 0
4916  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4917  team->t.t_nproc = new_nproc; // just get reserved threads involved
4918  } else {
4919  // we may have some threads in reserve, but not enough
4920  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4921  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4922 #endif // KMP_NESTED_HOT_TEAMS
4923  if(team->t.t_max_nproc < new_nproc) {
4924  /* reallocate larger arrays */
4925  __kmp_reallocate_team_arrays(team, new_nproc);
4926  __kmp_reinitialize_team( team, new_icvs, NULL );
4927  }
4928 
4929 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4930  /* Temporarily set a full mask for the master thread before the
4931  workers are created. The reason is that workers inherit their
4932  affinity from the master, so if many workers are created on a
4933  single core in quick succession, they do not get a chance to set
4934  their own affinity for a long time.
4935  */
4936  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4937 #endif
4938 
4939  /* allocate new threads for the hot team */
4940  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
4941  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4942  KMP_DEBUG_ASSERT( new_worker );
4943  team->t.t_threads[ f ] = new_worker;
4944 
4945  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: join=%llu, plain=%llu\n",
4946  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4947  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4948  team->t.t_bar[bs_plain_barrier].b_arrived ) );
4949 
4950  { // Initialize barrier data for new threads.
4951  int b;
4952  kmp_balign_t * balign = new_worker->th.th_bar;
4953  for( b = 0; b < bs_last_barrier; ++ b ) {
4954  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
4955  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4956 #if USE_DEBUGGER
4957  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
4958 #endif
4959  }
4960  }
4961  }
4962 
4963 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4964  if ( KMP_AFFINITY_CAPABLE() ) {
4965  /* Restore initial master thread's affinity mask */
4966  __kmp_set_system_affinity( old_mask, TRUE );
4967  KMP_CPU_FREE(old_mask);
4968  }
4969 #endif
4970 #if KMP_NESTED_HOT_TEAMS
4971  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
4972 #endif // KMP_NESTED_HOT_TEAMS
4973  /* make sure everyone is synchronized */
4974  int old_nproc = team->t.t_nproc; // save the old value and use it to update only the new threads below
4975  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
4976 
4977  /* reinitialize the threads */
4978  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
4979  for (f=0; f < team->t.t_nproc; ++f)
4980  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
4981  if (level) { // set th_task_state for new threads in nested hot team
4982  // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
4983  // th_task_state for the new threads. th_task_state for master thread will not be accurate until
4984  // after this in __kmp_fork_call(), so we look to the master's memo_stack to get the correct value.
4985  for (f=old_nproc; f < team->t.t_nproc; ++f)
4986  team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
4987  }
4988  else { // set th_task_state for new threads in non-nested hot team
4989  int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
4990  for (f=old_nproc; f < team->t.t_nproc; ++f)
4991  team->t.t_threads[f]->th.th_task_state = old_state;
4992  }
4993 
4994 #ifdef KMP_DEBUG
4995  for ( f = 0; f < team->t.t_nproc; ++ f ) {
4996  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4997  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4998  }
4999 #endif
5000 
5001 #if OMP_40_ENABLED
5002  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5003 # if KMP_AFFINITY_SUPPORTED
5004  __kmp_partition_places( team );
5005 # endif
5006 #endif
5007  } // Check changes in number of threads
5008 
5009 #if OMP_40_ENABLED
5010  kmp_info_t *master = team->t.t_threads[0];
5011  if( master->th.th_teams_microtask ) {
5012  for( f = 1; f < new_nproc; ++f ) {
5013  // propagate teams construct specific info to workers
5014  kmp_info_t *thr = team->t.t_threads[f];
5015  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5016  thr->th.th_teams_level = master->th.th_teams_level;
5017  thr->th.th_teams_size = master->th.th_teams_size;
5018  }
5019  }
5020 #endif /* OMP_40_ENABLED */
5021 #if KMP_NESTED_HOT_TEAMS
5022  if( level ) {
5023  // Sync barrier state for nested hot teams, not needed for outermost hot team.
5024  for( f = 1; f < new_nproc; ++f ) {
5025  kmp_info_t *thr = team->t.t_threads[f];
5026  int b;
5027  kmp_balign_t * balign = thr->th.th_bar;
5028  for( b = 0; b < bs_last_barrier; ++ b ) {
5029  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5030  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5031 #if USE_DEBUGGER
5032  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5033 #endif
5034  }
5035  }
5036  }
5037 #endif // KMP_NESTED_HOT_TEAMS
5038 
5039  /* reallocate space for arguments if necessary */
5040  __kmp_alloc_argv_entries( argc, team, TRUE );
5041  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5042  //
5043  // The hot team re-uses the previous task team,
5044  // if untouched during the previous release->gather phase.
5045  //
5046 
5047  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5048 
5049 #if KMP_DEBUG
5050  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5051  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5052  team->t.t_task_team[0], team->t.t_task_team[1] ));
5053  }
5054 #endif
5055 
5056 #if OMPT_SUPPORT
5057  __ompt_team_assign_id(team, ompt_parallel_id);
5058 #endif
5059 
5060  KMP_MB();
5061 
5062  return team;
5063  }
5064 
5065  /* next, let's try to take one from the team pool */
5066  KMP_MB();
5067  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5068  {
5069  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5070  if ( team->t.t_max_nproc >= max_nproc ) {
5071  /* take this team from the team pool */
5072  __kmp_team_pool = team->t.t_next_pool;
5073 
5074  /* setup the team for fresh use */
5075  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5076 
5077  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5078  &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5079  team->t.t_task_team[0] = NULL;
5080  team->t.t_task_team[1] = NULL;
5081 
5082  /* reallocate space for arguments if necessary */
5083  __kmp_alloc_argv_entries( argc, team, TRUE );
5084  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5085 
5086  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5087  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5088  { // Initialize barrier data.
5089  int b;
5090  for ( b = 0; b < bs_last_barrier; ++ b) {
5091  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5092 #if USE_DEBUGGER
5093  team->t.t_bar[ b ].b_master_arrived = 0;
5094  team->t.t_bar[ b ].b_team_arrived = 0;
5095 #endif
5096  }
5097  }
5098 
5099 #if OMP_40_ENABLED
5100  team->t.t_proc_bind = new_proc_bind;
5101 #endif
5102 
5103  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5104 
5105 #if OMPT_SUPPORT
5106  __ompt_team_assign_id(team, ompt_parallel_id);
5107 #endif
5108 
5109  KMP_MB();
5110 
5111  return team;
5112  }
5113 
5114  /* reap team if it is too small, then loop back and check the next one */
5115  /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5116  /* TODO: Use a technique to find the right-size hot team; don't reap them */
5117  team = __kmp_reap_team( team );
5118  __kmp_team_pool = team;
5119  }
5120 
5121  /* nothing available in the pool, no matter, make a new team! */
5122  KMP_MB();
5123  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5124 
5125  /* and set it up */
5126  team->t.t_max_nproc = max_nproc;
5127  /* NOTE well, for some reason allocating one big buffer and dividing it
5128  * up seems to really hurt performance a lot on the P4, so, let's not use
5129  * this... */
5130  __kmp_allocate_team_arrays( team, max_nproc );
5131 
5132  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5133  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5134 
5135  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5136  &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5137  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5138  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5139 
5140  if ( __kmp_storage_map ) {
5141  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5142  }
5143 
5144  /* allocate space for arguments */
5145  __kmp_alloc_argv_entries( argc, team, FALSE );
5146  team->t.t_argc = argc;
5147 
5148  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5149  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5150  { // Initialize barrier data.
5151  int b;
5152  for ( b = 0; b < bs_last_barrier; ++ b ) {
5153  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5154 #if USE_DEBUGGER
5155  team->t.t_bar[ b ].b_master_arrived = 0;
5156  team->t.t_bar[ b ].b_team_arrived = 0;
5157 #endif
5158  }
5159  }
5160 
5161 #if OMP_40_ENABLED
5162  team->t.t_proc_bind = new_proc_bind;
5163 #endif
5164 
5165 #if OMPT_SUPPORT
5166  __ompt_team_assign_id(team, ompt_parallel_id);
5167  team->t.ompt_serialized_team_info = NULL;
5168 #endif
5169 
5170  KMP_MB();
5171 
5172  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5173 
5174  return team;
5175 }
5176 
5177 /* TODO implement hot-teams at all levels */
5178 /* TODO implement lazy thread release on demand (disband request) */
5179 
5180 /* free the team. return it to the team pool. release all the threads
5181  * associated with it */
5182 void
5183 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
5184 {
5185  int f;
5186  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5187 
5188  /* verify state */
5189  KMP_DEBUG_ASSERT( root );
5190  KMP_DEBUG_ASSERT( team );
5191  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5192  KMP_DEBUG_ASSERT( team->t.t_threads );
5193 
5194  int use_hot_team = team == root->r.r_hot_team;
5195 #if KMP_NESTED_HOT_TEAMS
5196  int level;
5197  kmp_hot_team_ptr_t *hot_teams;
5198  if( master ) {
5199  level = team->t.t_active_level - 1;
5200  if( master->th.th_teams_microtask ) { // in teams construct?
5201  if( master->th.th_teams_size.nteams > 1 ) {
5202  ++level; // level was not increased in teams construct for team_of_masters
5203  }
5204  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5205  master->th.th_teams_level == team->t.t_level ) {
5206  ++level; // level was not increased in teams construct for team_of_workers before the parallel
5207  } // team->t.t_level will be increased inside parallel
5208  }
5209  hot_teams = master->th.th_hot_teams;
5210  if( level < __kmp_hot_teams_max_level ) {
5211  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5212  use_hot_team = 1;
5213  }
5214  }
5215 #endif // KMP_NESTED_HOT_TEAMS
5216 
5217  /* team is done working */
5218  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5219  team->t.t_copyin_counter = 0; // init counter for possible reuse
5220  // Do not reset pointer to parent team to NULL for hot teams.
5221 
5222  /* if we are non-hot team, release our threads */
5223  if( ! use_hot_team ) {
5224  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5225  // Delete task teams
5226  int tt_idx;
5227  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5228  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5229  if ( task_team != NULL ) {
5230  for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5231  team->t.t_threads[f]->th.th_task_team = NULL;
5232  }
5233  KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5234 #if KMP_NESTED_HOT_TEAMS
5235  __kmp_free_task_team( master, task_team );
5236 #endif
5237  team->t.t_task_team[tt_idx] = NULL;
5238  }
5239  }
5240  }
5241 
5242  // Reset pointer to parent team only for non-hot teams.
5243  team->t.t_parent = NULL;
5244  team->t.t_level = 0;
5245  team->t.t_active_level = 0;
5246 
5247  /* free the worker threads */
5248  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5249  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5250  __kmp_free_thread( team->t.t_threads[ f ] );
5251  team->t.t_threads[ f ] = NULL;
5252  }
5253 
5254  /* put the team back in the team pool */
5255  /* TODO limit size of team pool, call reap_team if pool too large */
5256  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5257  __kmp_team_pool = (volatile kmp_team_t*) team;
5258  }
5259 
5260  KMP_MB();
5261 }
5262 
5263 
5264 /* reap the team. destroy it, reclaim all its resources and free its memory */
5265 kmp_team_t *
5266 __kmp_reap_team( kmp_team_t *team )
5267 {
5268  kmp_team_t *next_pool = team->t.t_next_pool;
5269 
5270  KMP_DEBUG_ASSERT( team );
5271  KMP_DEBUG_ASSERT( team->t.t_dispatch );
5272  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5273  KMP_DEBUG_ASSERT( team->t.t_threads );
5274  KMP_DEBUG_ASSERT( team->t.t_argv );
5275 
5276  /* TODO clean the threads that are a part of this? */
5277 
5278  /* free stuff */
5279 
5280  __kmp_free_team_arrays( team );
5281  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5282  __kmp_free( (void*) team->t.t_argv );
5283  __kmp_free( team );
5284 
5285  KMP_MB();
5286  return next_pool;
5287 }
5288 
5289 //
5290 // Free the thread. Don't reap it, just place it on the pool of available
5291 // threads.
5292 //
5293 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5294 // binding for the affinity mechanism to be useful.
5295 //
5296 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5297 // However, we want to avoid a potential performance problem by always
5298 // scanning through the list to find the correct point at which to insert
5299 // the thread (potential N**2 behavior). To do this we keep track of the
5300 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5301 // With single-level parallelism, threads will always be added to the tail
5302 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5303 // parallelism, all bets are off and we may need to scan through the entire
5304 // free list.
5305 //
5306 // This change also has a potentially large performance benefit, for some
5307 // applications. Previously, as threads were freed from the hot team, they
5308 // would be placed back on the free list in inverse order. If the hot team
5309 // grew back to its original size, then the freed threads would be placed
5310 // back on the hot team in reverse order. This could cause bad cache
5311 // locality problems on programs where the size of the hot team regularly
5312 // grew and shrunk.
5313 //
5314 // Now, for single-level parallelism, the OMP tid is always == gtid.
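//
// Illustrative example (example values only): if the pool currently holds
// threads with gtids {2, 5, 9} and a thread with gtid 7 is freed, the scan
// below starts at __kmp_thread_pool_insert_pt (or at the list head if that
// hint is already past gtid 7) and links the freed thread in between 5 and 9,
// keeping the pool sorted by gtid.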
5315 //
5316 void
5317 __kmp_free_thread( kmp_info_t *this_th )
5318 {
5319  int gtid;
5320  kmp_info_t **scan;
5321 
5322  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5323  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5324 
5325  KMP_DEBUG_ASSERT( this_th );
5326 
5327  // When moving a thread to the pool, switch it to wait on its own b_go flag and on an uninitialized (NULL) team.
5328  int b;
5329  kmp_balign_t *balign = this_th->th.th_bar;
5330  for (b=0; b<bs_last_barrier; ++b) {
5331  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5332  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5333  balign[b].bb.team = NULL;
5334  }
5335  this_th->th.th_task_state = 0;
5336 
5337  /* put thread back on the free pool */
5338  TCW_PTR(this_th->th.th_team, NULL);
5339  TCW_PTR(this_th->th.th_root, NULL);
5340  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5341 
5342  //
5343  // If the __kmp_thread_pool_insert_pt is already past the new insert
5344  // point, then we need to re-scan the entire list.
5345  //
5346  gtid = this_th->th.th_info.ds.ds_gtid;
5347  if ( __kmp_thread_pool_insert_pt != NULL ) {
5348  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5349  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5350  __kmp_thread_pool_insert_pt = NULL;
5351  }
5352  }
5353 
5354  //
5355  // Scan down the list to find the place to insert the thread.
5356  // scan is the address of a link in the list, possibly the address of
5357  // __kmp_thread_pool itself.
5358  //
5359  // In the absence of nested parallelism, the for loop will have 0 iterations.
5360  //
5361  if ( __kmp_thread_pool_insert_pt != NULL ) {
5362  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5363  }
5364  else {
5365  scan = (kmp_info_t **)&__kmp_thread_pool;
5366  }
5367  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5368  scan = &( (*scan)->th.th_next_pool ) );
5369 
5370  //
5371  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5372  // to its address.
5373  //
5374  TCW_PTR(this_th->th.th_next_pool, *scan);
5375  __kmp_thread_pool_insert_pt = *scan = this_th;
5376  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5377  || ( this_th->th.th_info.ds.ds_gtid
5378  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5379  TCW_4(this_th->th.th_in_pool, TRUE);
5380  __kmp_thread_pool_nth++;
5381 
5382  TCW_4(__kmp_nth, __kmp_nth - 1);
5383 
5384 #ifdef KMP_ADJUST_BLOCKTIME
5385  /* Adjust blocktime back to user setting or default if necessary */
5386  /* Middle initialization might never have occurred */
5387  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5388  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5389  if ( __kmp_nth <= __kmp_avail_proc ) {
5390  __kmp_zero_bt = FALSE;
5391  }
5392  }
5393 #endif /* KMP_ADJUST_BLOCKTIME */
5394 
5395  KMP_MB();
5396 }
5397 
5398 
5399 /* ------------------------------------------------------------------------ */
5400 
5401 void *
5402 __kmp_launch_thread( kmp_info_t *this_thr )
5403 {
5404  int gtid = this_thr->th.th_info.ds.ds_gtid;
5405 /* void *stack_data;*/
5406  kmp_team_t *(*volatile pteam);
5407 
5408  KMP_MB();
5409  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5410 
5411  if( __kmp_env_consistency_check ) {
5412  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5413  }
5414 
5415 #if OMPT_SUPPORT
5416  if (ompt_enabled) {
5417  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5418  this_thr->th.ompt_thread_info.wait_id = 0;
5419  this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5420  if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5421  __ompt_thread_begin(ompt_thread_worker, gtid);
5422  }
5423  }
5424 #endif
5425 
5426  /* This is the place where threads wait for work */
5427  while( ! TCR_4(__kmp_global.g.g_done) ) {
5428  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5429  KMP_MB();
5430 
5431  /* wait for work to do */
5432  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5433 
5434 #if OMPT_SUPPORT
5435  if (ompt_enabled) {
5436  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5437  }
5438 #endif
5439 
5440  /* No tid yet since not part of a team */
5441  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5442 
5443 #if OMPT_SUPPORT
5444  if (ompt_enabled) {
5445  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5446  }
5447 #endif
5448 
5449  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
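  // Note: pteam holds the address of this thread's th_team field, so each
  // TCR_SYNC_PTR(*pteam) below re-reads that field and observes the team
  // most recently assigned to this worker by the master.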
5450 
5451  /* have we been allocated? */
5452  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5453 #if OMPT_SUPPORT
5454  ompt_task_info_t *task_info;
5455  ompt_parallel_id_t my_parallel_id;
5456  if (ompt_enabled) {
5457  task_info = __ompt_get_taskinfo(0);
5458  my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5459  }
5460 #endif
5461  /* we were just woken up, so run our new task */
5462  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5463  int rc;
5464  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5465  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5466 
5467  updateHWFPControl (*pteam);
5468 
5469 #if OMPT_SUPPORT
5470  if (ompt_enabled) {
5471  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5472  // Initialize OMPT task id for implicit task.
5473  int tid = __kmp_tid_from_gtid(gtid);
5474  task_info->task_id = __ompt_task_id_new(tid);
5475  }
5476 #endif
5477 
5478  KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
5479  {
5480  KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
5481  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5482  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5483  rc = (*pteam)->t.t_invoke( gtid );
5484  }
5485  KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
5486  KMP_ASSERT( rc );
5487 
5488 #if OMPT_SUPPORT
5489  if (ompt_enabled) {
5490  /* no frame set while outside task */
5491  task_info->frame.exit_runtime_frame = 0;
5492 
5493  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5494  }
5495 #endif
5496  KMP_MB();
5497  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5498  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5499  }
5500  /* join barrier after parallel region */
5501  __kmp_join_barrier( gtid );
5502 #if OMPT_SUPPORT && OMPT_TRACE
5503  if (ompt_enabled) {
5504  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5505  // don't access *pteam here: it may have already been freed
5506  // by the master thread behind the barrier (possible race)
5507  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5508  my_parallel_id, task_info->task_id);
5509  }
5510  task_info->frame.exit_runtime_frame = 0;
5511  task_info->task_id = 0;
5512  }
5513 #endif
5514  }
5515  }
5516  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5517 
5518 #if OMPT_SUPPORT
5519  if (ompt_enabled &&
5520  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5521  __ompt_thread_end(ompt_thread_worker, gtid);
5522  }
5523 #endif
5524 
5525  this_thr->th.th_task_team = NULL;
5526  /* run the destructors for the threadprivate data for this thread */
5527  __kmp_common_destroy_gtid( gtid );
5528 
5529  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5530  KMP_MB();
5531  return this_thr;
5532 }
5533 
5534 /* ------------------------------------------------------------------------ */
5535 /* ------------------------------------------------------------------------ */
5536 
5537 void
5538 __kmp_internal_end_dest( void *specific_gtid )
5539 {
5540  #if KMP_COMPILER_ICC
5541  #pragma warning( push )
5542  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5543  #endif
5544  // Make sure no significant bits are lost
5545  int gtid = (kmp_intptr_t)specific_gtid - 1;
5546  #if KMP_COMPILER_ICC
5547  #pragma warning( pop )
5548  #endif
5549 
5550  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5551  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5552  * this is because 0 is reserved for the nothing-stored case */
5553 
5554  /* josh: One reason for setting the gtid specific data even when it is being
5555  destroyed by pthread is to allow gtid lookup through thread specific data
5556  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5557  that gets executed in the call to __kmp_internal_end_thread, actually
5558  gets the gtid through the thread specific data. Setting it here seems
5559  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5560  to run smoothly.
5561  todo: get rid of this after we remove the dependence on
5562  __kmp_gtid_get_specific
5563  */
5564  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5565  __kmp_gtid_set_specific( gtid );
5566  #ifdef KMP_TDATA_GTID
5567  __kmp_gtid = gtid;
5568  #endif
5569  __kmp_internal_end_thread( gtid );
5570 }
5571 
5572 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5573 
5574 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5575 // perfectly, but in real libomp.so I have no evidence it is ever called. However, -fini linker
5576 // option in makefile.mk works fine.
5577 
5578 __attribute__(( destructor ))
5579 void
5580 __kmp_internal_end_dtor( void )
5581 {
5582  __kmp_internal_end_atexit();
5583 }
5584 
5585 void
5586 __kmp_internal_end_fini( void )
5587 {
5588  __kmp_internal_end_atexit();
5589 }
5590 
5591 #endif
5592 
5593 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5594 void
5595 __kmp_internal_end_atexit( void )
5596 {
5597  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5598  /* [Windows]
5599  josh: ideally, we want to completely shut down the library in this atexit handler, but
5600  stat code that depends on thread specific data for gtid fails because that data becomes
5601  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5602  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5603  stat code and use __kmp_internal_end_library to cleanly shut down the library.
5604 
5605 // TODO: Can some of this comment about GVS be removed?
5606  I suspect that the offending stat code is executed when the calling thread tries to
5607  clean up a dead root thread's data structures, resulting in GVS code trying to close
5608  the GVS structures for that thread, but since the stat code uses
5609  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5610  cleaning up itself instead of another thread, it gets confused. This happens because
5611  allowing a thread to unregister and cleanup another thread is a recent modification for
5612  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5613  thread may end up trying to unregister another thread only if thread death does not
5614  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5615  specific data destructor function to detect thread death. For Windows dynamic, there
5616  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5617  workaround is applicable only for Windows static stat library.
5618  */
5619  __kmp_internal_end_library( -1 );
5620  #if KMP_OS_WINDOWS
5621  __kmp_close_console();
5622  #endif
5623 }
5624 
5625 static void
5626 __kmp_reap_thread(
5627  kmp_info_t * thread,
5628  int is_root
5629 ) {
5630 
5631  // It is assumed __kmp_forkjoin_lock is acquired.
5632 
5633  int gtid;
5634 
5635  KMP_DEBUG_ASSERT( thread != NULL );
5636 
5637  gtid = thread->th.th_info.ds.ds_gtid;
5638 
5639  if ( ! is_root ) {
5640 
5641  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5642  /* Assume the threads are at the fork barrier here */
5643  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5644  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5645  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5646  __kmp_release_64(&flag);
5647  }; // if
5648 
5649  // Terminate OS thread.
5650  __kmp_reap_worker( thread );
5651 
5652  //
5653  // The thread was killed asynchronously. If it was actively
5654  // spinning in the thread pool, decrement the global count.
5655  //
5656  // There is a small timing hole here - if the worker thread was
5657  // just waking up after sleeping in the pool, had reset its
5658  // th_active_in_pool flag but not decremented the global counter
5659  // __kmp_thread_pool_active_nth yet, then the global counter
5660  // might not get updated.
5661  //
5662  // Currently, this can only happen as the library is unloaded,
5663  // so there are no harmful side effects.
5664  //
5665  if ( thread->th.th_active_in_pool ) {
5666  thread->th.th_active_in_pool = FALSE;
5667  KMP_TEST_THEN_DEC32(
5668  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5669  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5670  }
5671 
5672  // Decrement # of [worker] threads in the pool.
5673  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5674  --__kmp_thread_pool_nth;
5675  }; // if
5676 
5677  // Free the fast memory for tasking
5678  #if USE_FAST_MEMORY
5679  __kmp_free_fast_memory( thread );
5680  #endif /* USE_FAST_MEMORY */
5681 
5682  __kmp_suspend_uninitialize_thread( thread );
5683 
5684  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5685  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5686 
5687  -- __kmp_all_nth;
5688  // __kmp_nth was decremented when thread is added to the pool.
5689 
5690 #ifdef KMP_ADJUST_BLOCKTIME
5691  /* Adjust blocktime back to user setting or default if necessary */
5692  /* Middle initialization might never have occurred */
5693  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5694  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5695  if ( __kmp_nth <= __kmp_avail_proc ) {
5696  __kmp_zero_bt = FALSE;
5697  }
5698  }
5699 #endif /* KMP_ADJUST_BLOCKTIME */
5700 
5701  /* free the memory being used */
5702  if( __kmp_env_consistency_check ) {
5703  if ( thread->th.th_cons ) {
5704  __kmp_free_cons_stack( thread->th.th_cons );
5705  thread->th.th_cons = NULL;
5706  }; // if
5707  }
5708 
5709  if ( thread->th.th_pri_common != NULL ) {
5710  __kmp_free( thread->th.th_pri_common );
5711  thread->th.th_pri_common = NULL;
5712  }; // if
5713 
5714  if (thread->th.th_task_state_memo_stack != NULL) {
5715  __kmp_free(thread->th.th_task_state_memo_stack);
5716  thread->th.th_task_state_memo_stack = NULL;
5717  }
5718 
5719  #if KMP_USE_BGET
5720  if ( thread->th.th_local.bget_data != NULL ) {
5721  __kmp_finalize_bget( thread );
5722  }; // if
5723  #endif
5724 
5725 #if KMP_AFFINITY_SUPPORTED
5726  if ( thread->th.th_affin_mask != NULL ) {
5727  KMP_CPU_FREE( thread->th.th_affin_mask );
5728  thread->th.th_affin_mask = NULL;
5729  }; // if
5730 #endif /* KMP_AFFINITY_SUPPORTED */
5731 
5732  __kmp_reap_team( thread->th.th_serial_team );
5733  thread->th.th_serial_team = NULL;
5734  __kmp_free( thread );
5735 
5736  KMP_MB();
5737 
5738 } // __kmp_reap_thread
5739 
5740 static void
5741 __kmp_internal_end(void)
5742 {
5743  int i;
5744 
5745  /* First, unregister the library */
5746  __kmp_unregister_library();
5747 
5748  #if KMP_OS_WINDOWS
5749  /* In Win static library, we can't tell when a root actually dies, so we
5750  reclaim the data structures for any root threads that have died but not
5751  unregistered themselves, in order to shut down cleanly.
5752  In Win dynamic library we also can't tell when a thread dies.
5753  */
5754  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5755  #endif
5756 
5757  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5758  if( __kmp_root[i] )
5759  if( __kmp_root[i]->r.r_active )
5760  break;
5761  KMP_MB(); /* Flush all pending memory write invalidates. */
5762  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5763 
5764  if ( i < __kmp_threads_capacity ) {
5765  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5766  KMP_MB(); /* Flush all pending memory write invalidates. */
5767 
5768  //
5769  // Need to check that monitor was initialized before reaping it.
5770  // If we are called from __kmp_atfork_child (which sets
5771  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5772  // contain valid data, but it is only valid in the parent process,
5773  // not the child.
5774  //
5775  // New behavior (201008): instead of keying off of the flag
5776  // __kmp_init_parallel, the monitor thread creation is keyed off
5777  // of the new flag __kmp_init_monitor.
5778  //
5779  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5780  if ( TCR_4( __kmp_init_monitor ) ) {
5781  __kmp_reap_monitor( & __kmp_monitor );
5782  TCW_4( __kmp_init_monitor, 0 );
5783  }
5784  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5785  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5786  } else {
5787  /* TODO move this to cleanup code */
5788  #ifdef KMP_DEBUG
5789  /* make sure that everything has properly ended */
5790  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5791  if( __kmp_root[i] ) {
5792 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5793  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5794  }
5795  }
5796  #endif
5797 
5798  KMP_MB();
5799 
5800  // Reap the worker threads.
5801  // This is valid for now, but be careful if threads are reaped sooner.
5802  while ( __kmp_thread_pool != NULL ) { // Loop thru all the threads in the pool.
5803  // Get the next thread from the pool.
5804  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5805  __kmp_thread_pool = thread->th.th_next_pool;
5806  // Reap it.
5807  thread->th.th_next_pool = NULL;
5808  thread->th.th_in_pool = FALSE;
5809  __kmp_reap_thread( thread, 0 );
5810  }; // while
5811  __kmp_thread_pool_insert_pt = NULL;
5812 
5813  // Reap teams.
5814  while ( __kmp_team_pool != NULL ) { // Loop thru all the teams in the pool.
5815  // Get the next team from the pool.
5816  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5817  __kmp_team_pool = team->t.t_next_pool;
5818  // Reap it.
5819  team->t.t_next_pool = NULL;
5820  __kmp_reap_team( team );
5821  }; // while
5822 
5823  __kmp_reap_task_teams( );
5824 
5825  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5826  // TBD: Add some checking...
5827  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5828  }
5829 
5830  /* Make sure all threadprivate destructors get run by joining with all worker
5831  threads before resetting this flag */
5832  TCW_SYNC_4(__kmp_init_common, FALSE);
5833 
5834  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5835  KMP_MB();
5836 
5837  //
5838  // See note above: One of the possible fixes for CQ138434 / CQ140126
5839  //
5840  // FIXME: push both code fragments down and CSE them?
5841  // push them into __kmp_cleanup() ?
5842  //
5843  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5844  if ( TCR_4( __kmp_init_monitor ) ) {
5845  __kmp_reap_monitor( & __kmp_monitor );
5846  TCW_4( __kmp_init_monitor, 0 );
5847  }
5848  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5849  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5850 
5851  } /* else !__kmp_global.t_active */
5852  TCW_4(__kmp_init_gtid, FALSE);
5853  KMP_MB(); /* Flush all pending memory write invalidates. */
5854 
5855  __kmp_cleanup();
5856 #if OMPT_SUPPORT
5857  ompt_fini();
5858 #endif
5859 }
5860 
5861 void
5862 __kmp_internal_end_library( int gtid_req )
5863 {
5864  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5865  /* this shouldn't be a race condition because __kmp_internal_end() is the
5866  * only place to clear __kmp_init_serial */
5867  /* we'll check this later too, after we get the lock */
5868  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5869  // because the next check will work in any case.
5870  if( __kmp_global.g.g_abort ) {
5871  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5872  /* TODO abort? */
5873  return;
5874  }
5875  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5876  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5877  return;
5878  }
5879 
5880 
5881  KMP_MB(); /* Flush all pending memory write invalidates. */
5882 
5883  /* find out who we are and what we should do */
5884  {
5885  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5886  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5887  if( gtid == KMP_GTID_SHUTDOWN ) {
5888  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5889  return;
5890  } else if( gtid == KMP_GTID_MONITOR ) {
5891  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5892  return;
5893  } else if( gtid == KMP_GTID_DNE ) {
5894  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5895  /* we don't know who we are, but we may still shut down the library */
5896  } else if( KMP_UBER_GTID( gtid )) {
5897  /* unregister ourselves as an uber thread. gtid is no longer valid */
5898  if( __kmp_root[gtid]->r.r_active ) {
5899  __kmp_global.g.g_abort = -1;
5900  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5901  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5902  return;
5903  } else {
5904  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5905  __kmp_unregister_root_current_thread( gtid );
5906  }
5907  } else {
5908  /* worker threads may call this function through the atexit handler, if they call exit() */
5909  /* For now, skip the usual subsequent processing and just dump the debug buffer.
5910  TODO: do a thorough shutdown instead
5911  */
5912  #ifdef DUMP_DEBUG_ON_EXIT
5913  if ( __kmp_debug_buf )
5914  __kmp_dump_debug_buffer( );
5915  #endif
5916  return;
5917  }
5918  }
5919  /* synchronize the termination process */
5920  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5921 
5922  /* have we already finished */
5923  if( __kmp_global.g.g_abort ) {
5924  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5925  /* TODO abort? */
5926  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5927  return;
5928  }
5929  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5930  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5931  return;
5932  }
5933 
5934  /* We need this lock to enforce mutual exclusion between this reading of
5935  __kmp_threads_capacity and the writing by __kmp_register_root.
5936  Alternatively, we can use a counter of roots that is
5937  atomically updated by __kmp_get_global_thread_id_reg,
5938  __kmp_do_serial_initialize and __kmp_internal_end_*.
5939  */
5940  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5941 
5942  /* now we can safely conduct the actual termination */
5943  __kmp_internal_end();
5944 
5945  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5946  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5947 
5948  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
5949 
5950  #ifdef DUMP_DEBUG_ON_EXIT
5951  if ( __kmp_debug_buf )
5952  __kmp_dump_debug_buffer();
5953  #endif
5954 
5955  #if KMP_OS_WINDOWS
5956  __kmp_close_console();
5957  #endif
5958 
5959  __kmp_fini_allocator();
5960 
5961 } // __kmp_internal_end_library
5962 
5963 void
5964 __kmp_internal_end_thread( int gtid_req )
5965 {
5966  int i;
5967 
5968  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5969  /* this shouldn't be a race condition because __kmp_internal_end() is the
5970  * only place to clear __kmp_init_serial */
5971  /* we'll check this later too, after we get the lock */
5972  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5973  // because the next check will work in any case.
5974  if( __kmp_global.g.g_abort ) {
5975  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
5976  /* TODO abort? */
5977  return;
5978  }
5979  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5980  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
5981  return;
5982  }
5983 
5984  KMP_MB(); /* Flush all pending memory write invalidates. */
5985 
5986  /* find out who we are and what we should do */
5987  {
5988  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5989  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
5990  if( gtid == KMP_GTID_SHUTDOWN ) {
5991  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
5992  return;
5993  } else if( gtid == KMP_GTID_MONITOR ) {
5994  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
5995  return;
5996  } else if( gtid == KMP_GTID_DNE ) {
5997  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
5998  return;
5999  /* we don't know who we are */
6000  } else if( KMP_UBER_GTID( gtid )) {
6001  /* unregister ourselves as an uber thread. gtid is no longer valid */
6002  if( __kmp_root[gtid]->r.r_active ) {
6003  __kmp_global.g.g_abort = -1;
6004  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6005  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6006  return;
6007  } else {
6008  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6009  __kmp_unregister_root_current_thread( gtid );
6010  }
6011  } else {
6012  /* just a worker thread, let's leave */
6013  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6014 
6015  if ( gtid >= 0 ) {
6016  __kmp_threads[gtid]->th.th_task_team = NULL;
6017  }
6018 
6019  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6020  return;
6021  }
6022  }
6023  #if defined KMP_DYNAMIC_LIB
6024  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
6025  // because it is better to shut down later, in the library destructor.
6026  // The reason for this change is a performance problem that occurs when a non-OpenMP thread
6027  // repeatedly forks and joins many OpenMP threads in a loop. We can save a lot of time by
6028  // keeping the worker threads alive until program shutdown.
6029  // OM: Removed the Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6030  // Windows (DPD200287443) that occurs when using critical sections from foreign threads.
6031  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6032  return;
6033  #endif
6034  /* synchronize the termination process */
6035  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6036 
6037  /* have we already finished */
6038  if( __kmp_global.g.g_abort ) {
6039  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6040  /* TODO abort? */
6041  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6042  return;
6043  }
6044  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6045  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6046  return;
6047  }
6048 
6049  /* We need this lock to enforce mutual exclusion between this reading of
6050  __kmp_threads_capacity and the writing by __kmp_register_root.
6051  Alternatively, we can use a counter of roots that is
6052  atomically updated by __kmp_get_global_thread_id_reg,
6053  __kmp_do_serial_initialize and __kmp_internal_end_*.
6054  */
6055 
6056  /* should we finish the run-time? are all siblings done? */
6057  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6058 
6059  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6060  if ( KMP_UBER_GTID( i ) ) {
6061  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6062  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6063  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6064  return;
6065  };
6066  }
6067 
6068  /* now we can safely conduct the actual termination */
6069 
6070  __kmp_internal_end();
6071 
6072  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6073  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6074 
6075  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6076 
6077  #ifdef DUMP_DEBUG_ON_EXIT
6078  if ( __kmp_debug_buf )
6079  __kmp_dump_debug_buffer();
6080  #endif
6081 } // __kmp_internal_end_thread
6082 
6083 // -------------------------------------------------------------------------------------------------
6084 // Library registration stuff.
6085 
6086 static long __kmp_registration_flag = 0;
6087  // Random value used to indicate library initialization.
6088 static char * __kmp_registration_str = NULL;
6089  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6090 
6091 
6092 static inline
6093 char *
6094 __kmp_reg_status_name() {
6095  /*
6096  On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
6097  If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
6098  the registered_lib_env env var cannot be found, because its name will contain a different pid.
6099  */
6100  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6101 } // __kmp_reg_status_name
6102 
6103 
6104 void
6105 __kmp_register_library_startup(
6106  void
6107 ) {
6108 
6109  char * name = __kmp_reg_status_name(); // Name of the environment variable.
6110  int done = 0;
6111  union {
6112  double dtime;
6113  long ltime;
6114  } time;
6115  #if KMP_OS_WINDOWS
6116  __kmp_initialize_system_tick();
6117  #endif
6118  __kmp_read_system_time( & time.dtime );
6119  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6120  __kmp_registration_str =
6121  __kmp_str_format(
6122  "%p-%lx-%s",
6123  & __kmp_registration_flag,
6124  __kmp_registration_flag,
6125  KMP_LIBRARY_FILE
6126  );
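  // The value stored in the env var has the form "<flag address>-<flag value>-<library file>",
  // e.g. (hypothetical) "0x7f0012340000-cafe1234-libomp.so"; a second copy of the RTL parses
  // these fields below to decide whether the registering copy is still alive.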
6127 
6128  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6129 
6130  while ( ! done ) {
6131 
6132  char * value = NULL; // Actual value of the environment variable.
6133 
6134  // Set the environment variable, but do not overwrite it if it already exists.
6135  __kmp_env_set( name, __kmp_registration_str, 0 );
6136  // Check whether the variable was actually written.
6137  value = __kmp_env_get( name );
6138  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6139 
6140  done = 1; // Ok, environment variable set successfully, exit the loop.
6141 
6142  } else {
6143 
6144  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6145  // Check whether it is alive or dead.
6146  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6147  char * tail = value;
6148  char * flag_addr_str = NULL;
6149  char * flag_val_str = NULL;
6150  char const * file_name = NULL;
6151  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6152  __kmp_str_split( tail, '-', & flag_val_str, & tail );
6153  file_name = tail;
6154  if ( tail != NULL ) {
6155  long * flag_addr = 0;
6156  long flag_val = 0;
6157  KMP_SSCANF( flag_addr_str, "%p", & flag_addr );
6158  KMP_SSCANF( flag_val_str, "%lx", & flag_val );
6159  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6160  // First, check whether the environment-encoded address is mapped into the address space.
6161  // If so, dereference it to see if it still has the right value.
6162 
6163  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6164  neighbor = 1;
6165  } else {
6166  // If not, then we know the other copy of the library is no longer running.
6167  neighbor = 2;
6168  }; // if
6169  }; // if
6170  }; // if
6171  switch ( neighbor ) {
6172  case 0 : // Cannot parse environment variable -- neighbor status unknown.
6173  // Assume it is an incompatible format from a future version of the library.
6174  // Assume the other library is alive.
6175  // WARN( ... ); // TODO: Issue a warning.
6176  file_name = "unknown library";
6177  // Attention! Falling through to the next case. That's intentional.
6178  case 1 : { // Neighbor is alive.
6179  // Check it is allowed.
6180  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6181  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6182  // That's not allowed. Issue fatal error.
6183  __kmp_msg(
6184  kmp_ms_fatal,
6185  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6186  KMP_HNT( DuplicateLibrary ),
6187  __kmp_msg_null
6188  );
6189  }; // if
6190  KMP_INTERNAL_FREE( duplicate_ok );
6191  __kmp_duplicate_library_ok = 1;
6192  done = 1; // Exit the loop.
6193  } break;
6194  case 2 : { // Neighbor is dead.
6195  // Clear the variable and try to register library again.
6196  __kmp_env_unset( name );
6197  } break;
6198  default : {
6199  KMP_DEBUG_ASSERT( 0 );
6200  } break;
6201  }; // switch
6202 
6203  }; // if
6204  KMP_INTERNAL_FREE( (void *) value );
6205 
6206  }; // while
6207  KMP_INTERNAL_FREE( (void *) name );
6208 
6209 } // func __kmp_register_library_startup
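/* For illustration only: with KMP_DUPLICATE_LIB_OK unset, a process that maps two live copies
   of the runtime aborts with the DuplicateLibrary fatal message above; setting
       KMP_DUPLICATE_LIB_OK=TRUE
   in the environment makes the second copy set __kmp_duplicate_library_ok and continue. */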
6210 
6211 
6212 void
6213 __kmp_unregister_library( void ) {
6214 
6215  char * name = __kmp_reg_status_name();
6216  char * value = __kmp_env_get( name );
6217 
6218  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6219  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
6220  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6221  // Ok, this is our variable. Delete it.
6222  __kmp_env_unset( name );
6223  }; // if
6224 
6225  KMP_INTERNAL_FREE( __kmp_registration_str );
6226  KMP_INTERNAL_FREE( value );
6227  KMP_INTERNAL_FREE( name );
6228 
6229  __kmp_registration_flag = 0;
6230  __kmp_registration_str = NULL;
6231 
6232 } // __kmp_unregister_library
6233 
6234 
6235 // End of Library registration stuff.
6236 // -------------------------------------------------------------------------------------------------
6237 
6238 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6239 
6240 static void __kmp_check_mic_type()
6241 {
6242  kmp_cpuid_t cpuid_state = {0};
6243  kmp_cpuid_t * cs_p = &cpuid_state;
6244  __kmp_x86_cpuid(1, 0, cs_p);
6245  // We don't support mic1 at the moment
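  // CPUID leaf 1 EAX packs stepping/model/family fields; masking with 0xff0 keeps family/model,
  // so 0xB10 appears to select family 0x0B, model 1 (KNC), while the 0xf0ff0 / 0x50670 test
  // appears to select family 6, extended model 5, model 7 (KNL).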
6246  if( (cs_p->eax & 0xff0) == 0xB10 ) {
6247  __kmp_mic_type = mic2;
6248  } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6249  __kmp_mic_type = mic3;
6250  } else {
6251  __kmp_mic_type = non_mic;
6252  }
6253 }
6254 
6255 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6256 
6257 static void
6258 __kmp_do_serial_initialize( void )
6259 {
6260  int i, gtid;
6261  int size;
6262 
6263  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6264 
6265  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6266  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6267  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6268  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6269  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6270 
6271 #if OMPT_SUPPORT
6272  ompt_pre_init();
6273 #endif
6274 
6275  __kmp_validate_locks();
6276 
6277  /* Initialize internal memory allocator */
6278  __kmp_init_allocator();
6279 
6280  /* Register the library startup via an environment variable
6281  and check to see whether another copy of the library is already
6282  registered. */
6283 
6284  __kmp_register_library_startup( );
6285 
6286  /* TODO reinitialization of library */
6287  if( TCR_4(__kmp_global.g.g_done) ) {
6288  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6289  }
6290 
6291  __kmp_global.g.g_abort = 0;
6292  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6293 
6294  /* initialize the locks */
6295 #if KMP_USE_ADAPTIVE_LOCKS
6296 #if KMP_DEBUG_ADAPTIVE_LOCKS
6297  __kmp_init_speculative_stats();
6298 #endif
6299 #endif
6300 #if KMP_STATS_ENABLED
6301  __kmp_init_tas_lock( & __kmp_stats_lock );
6302 #endif
6303  __kmp_init_lock( & __kmp_global_lock );
6304  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6305  __kmp_init_lock( & __kmp_debug_lock );
6306  __kmp_init_atomic_lock( & __kmp_atomic_lock );
6307  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
6308  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
6309  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
6310  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
6311  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
6312  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
6313  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
6314  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6315  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6316  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6317  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6318  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6319  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
6320  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
6321  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
6322  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6323 
6324  /* conduct initialization and initial setup of configuration */
6325 
6326  __kmp_runtime_initialize();
6327 
6328 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6329  __kmp_check_mic_type();
6330 #endif
6331 
6332  // Some global variable initialization moved here from kmp_env_initialize()
6333 #ifdef KMP_DEBUG
6334  kmp_diag = 0;
6335 #endif
6336  __kmp_abort_delay = 0;
6337 
6338  // From __kmp_init_dflt_team_nth()
6339  /* assume the entire machine will be used */
6340  __kmp_dflt_team_nth_ub = __kmp_xproc;
6341  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6342  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6343  }
6344  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6345  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6346  }
6347  __kmp_max_nth = __kmp_sys_max_nth;
6348 
6349  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
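  // The two derived values below follow from the block time: the monitor wakeup rate is chosen
  // from the block time, and bt_intervals is (roughly) how many monitor wakeups fit inside one
  // block-time window before a waiting thread goes to sleep.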
6350  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6351  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6352  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6353  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6354  __kmp_library = library_throughput;
6355  // From KMP_SCHEDULE initialization
6356  __kmp_static = kmp_sch_static_balanced;
6357  // AC: do not use analytical here, because it is non-monotonic
6358  //__kmp_guided = kmp_sch_guided_iterative_chunked;
6359  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6360  // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
6361  // control parts
6362  #if KMP_FAST_REDUCTION_BARRIER
6363  #define kmp_reduction_barrier_gather_bb ((int)1)
6364  #define kmp_reduction_barrier_release_bb ((int)1)
6365  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6366  #define kmp_reduction_barrier_release_pat bp_hyper_bar
6367  #endif // KMP_FAST_REDUCTION_BARRIER
6368  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6369  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6370  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6371  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6372  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6373  #if KMP_FAST_REDUCTION_BARRIER
6374  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6375  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6376  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6377  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6378  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6379  }
6380  #endif // KMP_FAST_REDUCTION_BARRIER
6381  }
6382  #if KMP_FAST_REDUCTION_BARRIER
6383  #undef kmp_reduction_barrier_release_pat
6384  #undef kmp_reduction_barrier_gather_pat
6385  #undef kmp_reduction_barrier_release_bb
6386  #undef kmp_reduction_barrier_gather_bb
6387  #endif // KMP_FAST_REDUCTION_BARRIER
6388 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6389  if (__kmp_mic_type == mic2) { // KNC
6390  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6391  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
6392  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
6393  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6394  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6395  }
6396 #if KMP_FAST_REDUCTION_BARRIER
6397  if (__kmp_mic_type == mic2) { // KNC
6398  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6399  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6400  }
6401 #endif
6402 #endif
6403 
6404  // From KMP_CHECKS initialization
6405 #ifdef KMP_DEBUG
6406  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6407 #else
6408  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6409 #endif
6410 
6411  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6412  __kmp_foreign_tp = TRUE;
6413 
6414  __kmp_global.g.g_dynamic = FALSE;
6415  __kmp_global.g.g_dynamic_mode = dynamic_default;
6416 
6417  __kmp_env_initialize( NULL );
6418 
6419  // Print all messages in message catalog for testing purposes.
6420  #ifdef KMP_DEBUG
6421  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6422  if ( __kmp_str_match_true( val ) ) {
6423  kmp_str_buf_t buffer;
6424  __kmp_str_buf_init( & buffer );
6425  __kmp_i18n_dump_catalog( & buffer );
6426  __kmp_printf( "%s", buffer.str );
6427  __kmp_str_buf_free( & buffer );
6428  }; // if
6429  __kmp_env_free( & val );
6430  #endif
6431 
6432  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6433  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6434  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6435 
6436  // If the library was shut down properly, these pools must be NULL. Just in case, reset them
6437  // to NULL -- some memory may leak, but subsequent code will work even if the pools were not freed.
6438  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6439  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6440  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
6441  __kmp_thread_pool = NULL;
6442  __kmp_thread_pool_insert_pt = NULL;
6443  __kmp_team_pool = NULL;
6444 
6445  /* Allocate all of the variable sized records */
6446  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6447  /* Since allocation is cache-aligned, just add extra padding at the end */
6448  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6449  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6450  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6451 
6452  /* init thread counts */
6453  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6454  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something was wrong in termination.
6455  __kmp_all_nth = 0;
6456  __kmp_nth = 0;
6457 
6458  /* setup the uber master thread and hierarchy */
6459  gtid = __kmp_register_root( TRUE );
6460  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6461  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6462  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6463 
6464  KMP_MB(); /* Flush all pending memory write invalidates. */
6465 
6466  __kmp_common_initialize();
6467 
6468  #if KMP_OS_UNIX
6469  /* invoke the child fork handler */
6470  __kmp_register_atfork();
6471  #endif
6472 
6473  #if ! defined KMP_DYNAMIC_LIB
6474  {
6475  /* Invoke the exit handler when the program finishes, but only for the static library.
6476  For the dynamic library, we already have _fini and DllMain.
6477  */
6478  int rc = atexit( __kmp_internal_end_atexit );
6479  if ( rc != 0 ) {
6480  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6481  }; // if
6482  }
6483  #endif
6484 
6485  #if KMP_HANDLE_SIGNALS
6486  #if KMP_OS_UNIX
6487  /* NOTE: make sure that this is called before the user installs
6488  * their own signal handlers so that the user handlers
6489  * are called first. This way they can return false,
6490  * not call our handler, avoid terminating the library,
6491  * and continue execution where they left off. */
6492  __kmp_install_signals( FALSE );
6493  #endif /* KMP_OS_UNIX */
6494  #if KMP_OS_WINDOWS
6495  __kmp_install_signals( TRUE );
6496  #endif /* KMP_OS_WINDOWS */
6497  #endif
6498 
6499  /* we have finished the serial initialization */
6500  __kmp_init_counter ++;
6501 
6502  __kmp_init_serial = TRUE;
6503 
6504  if (__kmp_settings) {
6505  __kmp_env_print();
6506  }
6507 
6508 #if OMP_40_ENABLED
6509  if (__kmp_display_env || __kmp_display_env_verbose) {
6510  __kmp_env_print_2();
6511  }
6512 #endif // OMP_40_ENABLED
6513 
6514 #if OMPT_SUPPORT
6515  ompt_post_init();
6516 #endif
6517 
6518  KMP_MB();
6519 
6520  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6521 }
6522 
6523 void
6524 __kmp_serial_initialize( void )
6525 {
6526  if ( __kmp_init_serial ) {
6527  return;
6528  }
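  // Double-checked initialization: test the flag without the lock, take the bootstrap lock,
  // then re-test in case another thread completed initialization while we waited.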
6529  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6530  if ( __kmp_init_serial ) {
6531  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6532  return;
6533  }
6534  __kmp_do_serial_initialize();
6535  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6536 }
6537 
6538 static void
6539 __kmp_do_middle_initialize( void )
6540 {
6541  int i, j;
6542  int prev_dflt_team_nth;
6543 
6544  if( !__kmp_init_serial ) {
6545  __kmp_do_serial_initialize();
6546  }
6547 
6548  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6549 
6550  //
6551  // Save the previous value for the __kmp_dflt_team_nth so that
6552  // we can avoid some reinitialization if it hasn't changed.
6553  //
6554  prev_dflt_team_nth = __kmp_dflt_team_nth;
6555 
6556 #if KMP_AFFINITY_SUPPORTED
6557  //
6558  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6559  // number of cores on the machine.
6560  //
6561  __kmp_affinity_initialize();
6562 
6563  //
6564  // Run through the __kmp_threads array and set the affinity mask
6565  // for each root thread that is currently registered with the RTL.
6566  //
6567  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6568  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6569  __kmp_affinity_set_init_mask( i, TRUE );
6570  }
6571  }
6572 #endif /* KMP_AFFINITY_SUPPORTED */
6573 
6574  KMP_ASSERT( __kmp_xproc > 0 );
6575  if ( __kmp_avail_proc == 0 ) {
6576  __kmp_avail_proc = __kmp_xproc;
6577  }
6578 
6579  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6580  j = 0;
6581  while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6582  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6583  j++;
6584  }
6585 
6586  if ( __kmp_dflt_team_nth == 0 ) {
6587 #ifdef KMP_DFLT_NTH_CORES
6588  //
6589  // Default #threads = #cores
6590  //
6591  __kmp_dflt_team_nth = __kmp_ncores;
6592  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6593  __kmp_dflt_team_nth ) );
6594 #else
6595  //
6596  // Default #threads = #available OS procs
6597  //
6598  __kmp_dflt_team_nth = __kmp_avail_proc;
6599  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6600  __kmp_dflt_team_nth ) );
6601 #endif /* KMP_DFLT_NTH_CORES */
6602  }
6603 
6604  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6605  __kmp_dflt_team_nth = KMP_MIN_NTH;
6606  }
6607  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6608  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6609  }
6610 
6611  //
6612  // There's no harm in continuing if the following check fails,
6613  // but it indicates an error in the previous logic.
6614  //
6615  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6616 
6617  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6618  //
6619  // Run through the __kmp_threads array and set the num threads icv
6620  // for each root thread that is currently registered with the RTL
6621  // (which has not already explicitly set its nthreads-var with a
6622  // call to omp_set_num_threads()).
6623  //
6624  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6625  kmp_info_t *thread = __kmp_threads[ i ];
6626  if ( thread == NULL ) continue;
6627  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6628 
6629  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6630  }
6631  }
6632  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6633  __kmp_dflt_team_nth) );
6634 
6635 #ifdef KMP_ADJUST_BLOCKTIME
6636  /* Adjust blocktime to zero if necessary */
6637  /* now that __kmp_avail_proc is set */
6638  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6639  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6640  if ( __kmp_nth > __kmp_avail_proc ) {
6641  __kmp_zero_bt = TRUE;
6642  }
6643  }
6644 #endif /* KMP_ADJUST_BLOCKTIME */
6645 
6646  /* we have finished middle initialization */
6647  TCW_SYNC_4(__kmp_init_middle, TRUE);
6648 
6649  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6650 }
6651 
6652 void
6653 __kmp_middle_initialize( void )
6654 {
6655  if ( __kmp_init_middle ) {
6656  return;
6657  }
6658  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6659  if ( __kmp_init_middle ) {
6660  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6661  return;
6662  }
6663  __kmp_do_middle_initialize();
6664  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6665 }
6666 
6667 void
6668 __kmp_parallel_initialize( void )
6669 {
6670  int gtid = __kmp_entry_gtid(); // this might be a new root
6671 
6672  /* synchronize parallel initialization (for sibling) */
6673  if( TCR_4(__kmp_init_parallel) ) return;
6674  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6675  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6676 
6677  /* TODO reinitialization after we have already shut down */
6678  if( TCR_4(__kmp_global.g.g_done) ) {
6679  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6680  __kmp_infinite_loop();
6681  }
6682 
6683  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6684  or __kmp_middle_initialize would cause a deadlock. So we call __kmp_do_middle_initialize
6685  directly (it calls __kmp_do_serial_initialize itself if needed). */
6686  if( !__kmp_init_middle ) {
6687  __kmp_do_middle_initialize();
6688  }
6689 
6690  /* begin initialization */
6691  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6692  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6693 
6694 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6695  //
6696  // Save the FP control regs.
6697  // Worker threads will set theirs to these values at thread startup.
6698  //
6699  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6700  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6701  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6702 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6703 
6704 #if KMP_OS_UNIX
6705 # if KMP_HANDLE_SIGNALS
6706  /* must be after __kmp_serial_initialize */
6707  __kmp_install_signals( TRUE );
6708 # endif
6709 #endif
6710 
6711  __kmp_suspend_initialize();
6712 
6713 #if defined(USE_LOAD_BALANCE)
6714  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6715  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6716  }
6717 #else
6718  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6719  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6720  }
6721 #endif
6722 
6723  if ( __kmp_version ) {
6724  __kmp_print_version_2();
6725  }
6726 
6727  /* we have finished parallel initialization */
6728  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6729 
6730  KMP_MB();
6731  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6732 
6733  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6734 }
6735 
6736 
6737 /* ------------------------------------------------------------------------ */
6738 
6739 void
6740 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6741  kmp_team_t *team )
6742 {
6743  kmp_disp_t *dispatch;
6744 
6745  KMP_MB();
6746 
6747  /* none of the threads have encountered any constructs, yet. */
6748  this_thr->th.th_local.this_construct = 0;
6749 #if KMP_CACHE_MANAGE
6750  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6751 #endif /* KMP_CACHE_MANAGE */
6752  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6753  KMP_DEBUG_ASSERT( dispatch );
6754  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6755  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6756 
6757  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6758 #if OMP_45_ENABLED
6759  dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
6760 #endif
6761  if( __kmp_env_consistency_check )
6762  __kmp_push_parallel( gtid, team->t.t_ident );
6763 
6764  KMP_MB(); /* Flush all pending memory write invalidates. */
6765 }
6766 
6767 void
6768 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6769  kmp_team_t *team )
6770 {
6771  if( __kmp_env_consistency_check )
6772  __kmp_pop_parallel( gtid, team->t.t_ident );
6773 }
6774 
6775 int
6776 __kmp_invoke_task_func( int gtid )
6777 {
6778  int rc;
6779  int tid = __kmp_tid_from_gtid( gtid );
6780  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6781  kmp_team_t *team = this_thr->th.th_team;
6782 
6783  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6784 #if USE_ITT_BUILD
6785  if ( __itt_stack_caller_create_ptr ) {
6786  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6787  }
6788 #endif /* USE_ITT_BUILD */
6789 #if INCLUDE_SSC_MARKS
6790  SSC_MARK_INVOKING();
6791 #endif
6792 
6793 #if OMPT_SUPPORT
6794  void *dummy;
6795  void **exit_runtime_p;
6796  ompt_task_id_t my_task_id;
6797  ompt_parallel_id_t my_parallel_id;
6798 
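  // If OMPT is enabled, hand the microtask the address of this implicit task's exit frame slot;
  // otherwise point it at a throwaway local so the microtask can store unconditionally.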
6799  if (ompt_enabled) {
6800  exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6801  ompt_task_info.frame.exit_runtime_frame);
6802  } else {
6803  exit_runtime_p = &dummy;
6804  }
6805 
6806 #if OMPT_TRACE
6807  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6808  my_parallel_id = team->t.ompt_team_info.parallel_id;
6809  if (ompt_enabled &&
6810  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6811  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6812  my_parallel_id, my_task_id);
6813  }
6814 #endif
6815 #endif
6816 
6817  {
6818  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6819  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6820  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6821  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6822 #if OMPT_SUPPORT
6823  , exit_runtime_p
6824 #endif
6825  );
6826  }
6827 
6828 #if USE_ITT_BUILD
6829  if ( __itt_stack_caller_create_ptr ) {
6830  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6831  }
6832 #endif /* USE_ITT_BUILD */
6833  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6834 
6835  return rc;
6836 }
6837 
6838 #if OMP_40_ENABLED
6839 void
6840 __kmp_teams_master( int gtid )
6841 {
6842  // This routine is called by all master threads in the teams construct
6843  kmp_info_t *thr = __kmp_threads[ gtid ];
6844  kmp_team_t *team = thr->th.th_team;
6845  ident_t *loc = team->t.t_ident;
6846  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6847  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6848  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6849  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6850  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6851  // Launch the league of teams now, but do not let the workers execute
6852  // (they wait on the fork barrier until the next parallel region)
6853 #if INCLUDE_SSC_MARKS
6854  SSC_MARK_FORKING();
6855 #endif
6856  __kmp_fork_call( loc, gtid, fork_context_intel,
6857  team->t.t_argc,
6858 #if OMPT_SUPPORT
6859  (void *)thr->th.th_teams_microtask, // "unwrapped" task
6860 #endif
6861  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6862  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6863  NULL );
6864 #if INCLUDE_SSC_MARKS
6865  SSC_MARK_JOINING();
6866 #endif
6867 
6868  // AC: the last parameter "1" eliminates the join barrier, which won't work because
6869  // the worker threads are in the fork barrier waiting for more parallel regions
6870  __kmp_join_call( loc, gtid
6871 #if OMPT_SUPPORT
6872  , fork_context_intel
6873 #endif
6874  , 1 );
6875 }
6876 
6877 int
6878 __kmp_invoke_teams_master( int gtid )
6879 {
6880  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6881  kmp_team_t *team = this_thr->th.th_team;
6882  #if KMP_DEBUG
6883  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6884  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6885  #endif
6886  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6887  __kmp_teams_master( gtid );
6888  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6889  return 1;
6890 }
6891 #endif /* OMP_40_ENABLED */
6892 
6893 /* this sets the requested number of threads for the next parallel region
6894  * encountered by this team */
6895  /* since this should be enclosed in the forkjoin critical section, it
6896  * should avoid race conditions with asymmetrical nested parallelism */
6897 
6898 void
6899 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6900 {
6901  kmp_info_t *thr = __kmp_threads[gtid];
6902 
6903  if( num_threads > 0 )
6904  thr->th.th_set_nproc = num_threads;
6905 }
6906 
6907 #if OMP_40_ENABLED
6908 
6909 /* this sets the requested number of teams for the teams region and/or
6910  * the number of threads for the next parallel region encountered */
6911 void
6912 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6913 {
6914  kmp_info_t *thr = __kmp_threads[gtid];
6915  KMP_DEBUG_ASSERT(num_teams >= 0);
6916  KMP_DEBUG_ASSERT(num_threads >= 0);
6917 
6918  if( num_teams == 0 )
6919  num_teams = 1; // default number of teams is 1.
6920  if( num_teams > __kmp_max_nth ) { // were too many teams requested?
6921  if ( !__kmp_reserve_warn ) {
6922  __kmp_reserve_warn = 1;
6923  __kmp_msg(
6924  kmp_ms_warning,
6925  KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
6926  KMP_HNT( Unset_ALL_THREADS ),
6927  __kmp_msg_null
6928  );
6929  }
6930  num_teams = __kmp_max_nth;
6931  }
6932  // Set number of teams (number of threads in the outer "parallel" of the teams)
6933  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6934 
6935  // Remember the number of threads for inner parallel regions
6936  if( num_threads == 0 ) {
6937  if( !TCR_4(__kmp_init_middle) )
6938  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6939  num_threads = __kmp_avail_proc / num_teams;
6940  if( num_teams * num_threads > __kmp_max_nth ) {
6941  // adjust num_threads w/o warning as it is not a user setting
6942  num_threads = __kmp_max_nth / num_teams;
6943  }
6944  } else {
6945  if( num_teams * num_threads > __kmp_max_nth ) {
6946  int new_threads = __kmp_max_nth / num_teams;
6947  if ( !__kmp_reserve_warn ) { // user asked for too many threads
6948  __kmp_reserve_warn = 1; // that conflicts with OMP_THREAD_LIMIT
6949  __kmp_msg(
6950  kmp_ms_warning,
6951  KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
6952  KMP_HNT( Unset_ALL_THREADS ),
6953  __kmp_msg_null
6954  );
6955  }
6956  num_threads = new_threads;
6957  }
6958  }
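  // For illustration (hypothetical numbers): with __kmp_avail_proc == 16 and num_teams == 4,
  // the default path above gives num_threads == 4 per team; if the product exceeded
  // __kmp_max_nth, num_threads would be clamped to __kmp_max_nth / num_teams.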
6959  thr->th.th_teams_size.nth = num_threads;
6960 }
6961 
6962 
6963 //
6964 // Set the proc_bind var to use in the following parallel region.
6965 //
6966 void
6967 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
6968 {
6969  kmp_info_t *thr = __kmp_threads[gtid];
6970  thr->th.th_set_proc_bind = proc_bind;
6971 }
6972 
6973 #endif /* OMP_40_ENABLED */
6974 
6975 /* Launch the worker threads into the microtask. */
6976 
6977 void
6978 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
6979 {
6980  kmp_info_t *this_thr = __kmp_threads[gtid];
6981 
6982 #ifdef KMP_DEBUG
6983  int f;
6984 #endif /* KMP_DEBUG */
6985 
6986  KMP_DEBUG_ASSERT( team );
6987  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
6988  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
6989  KMP_MB(); /* Flush all pending memory write invalidates. */
6990 
6991  team->t.t_construct = 0; /* no single directives seen yet */
6992  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
6993 
6994  /* Reset the identifiers on the dispatch buffer */
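  /* Each of the __kmp_dispatch_num_buffers buffers gets a distinct starting index, so that
  successive worksharing constructs in the region can rotate through the buffers. */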
6995  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
6996  if ( team->t.t_max_nproc > 1 ) {
6997  int i;
6998  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
6999  team->t.t_disp_buffer[ i ].buffer_index = i;
7000 #if OMP_45_ENABLED
7001  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7002 #endif
7003  }
7004  } else {
7005  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7006 #if OMP_45_ENABLED
7007  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7008 #endif
7009  }
7010 
7011  KMP_MB(); /* Flush all pending memory write invalidates. */
7012  KMP_ASSERT( this_thr->th.th_team == team );
7013 
7014 #ifdef KMP_DEBUG
7015  for( f=0 ; f<team->t.t_nproc ; f++ ) {
7016  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7017  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7018  }
7019 #endif /* KMP_DEBUG */
7020 
7021  /* release the worker threads so they may begin working */
7022  __kmp_fork_barrier( gtid, 0 );
7023 }
7024 
7025 
7026 void
7027 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7028 {
7029  kmp_info_t *this_thr = __kmp_threads[gtid];
7030 
7031  KMP_DEBUG_ASSERT( team );
7032  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7033  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7034  KMP_MB(); /* Flush all pending memory write invalidates. */
7035 
7036  /* Join barrier after fork */
7037 
7038 #ifdef KMP_DEBUG
7039  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7040  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7041  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7042  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7043  __kmp_print_structure();
7044  }
7045  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7046  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7047 #endif /* KMP_DEBUG */
7048 
7049  __kmp_join_barrier( gtid ); /* wait for everyone */
7050 
7051  KMP_MB(); /* Flush all pending memory write invalidates. */
7052  KMP_ASSERT( this_thr->th.th_team == team );
7053 }
7054 
7055 
7056 /* ------------------------------------------------------------------------ */
7057 /* ------------------------------------------------------------------------ */
7058 
7059 #ifdef USE_LOAD_BALANCE
7060 
7061 //
7062  // Return the number of worker threads actively spinning in the hot team, if we
7063 // are at the outermost level of parallelism. Otherwise, return 0.
7064 //
7065 static int
7066 __kmp_active_hot_team_nproc( kmp_root_t *root )
7067 {
7068  int i;
7069  int retval;
7070  kmp_team_t *hot_team;
7071 
7072  if ( root->r.r_active ) {
7073  return 0;
7074  }
7075  hot_team = root->r.r_hot_team;
7076  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7077  return hot_team->t.t_nproc - 1; // Don't count master thread
7078  }
7079 
7080  //
7081  // Skip the master thread - it is accounted for elsewhere.
7082  //
7083  retval = 0;
7084  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7085  if ( hot_team->t.t_threads[i]->th.th_active ) {
7086  retval++;
7087  }
7088  }
7089  return retval;
7090 }
7091 
7092 //
7093 // Perform an automatic adjustment to the number of
7094 // threads used by the next parallel region.
7095 //
7096 static int
7097 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7098 {
7099  int retval;
7100  int pool_active;
7101  int hot_team_active;
7102  int team_curr_active;
7103  int system_active;
7104 
7105  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7106  root, set_nproc ) );
7107  KMP_DEBUG_ASSERT( root );
7108  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7109  KMP_DEBUG_ASSERT( set_nproc > 1 );
7110 
7111  if ( set_nproc == 1) {
7112  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7113  return 1;
7114  }
7115 
7116  //
7117  // Threads that are active in the thread pool, active in the hot team
7118  // for this particular root (if we are at the outer par level), and
7119  // the currently executing thread (to become the master) are available
7120  // to add to the new team, but are currently contributing to the system
7121  // load, and must be accounted for.
7122  //
7123  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7124  hot_team_active = __kmp_active_hot_team_nproc( root );
7125  team_curr_active = pool_active + hot_team_active + 1;
7126 
7127  //
7128  // Check the system load.
7129  //
7130  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7131  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7132  system_active, pool_active, hot_team_active ) );
7133 
7134  if ( system_active < 0 ) {
7135  //
7136  // There was an error reading the necessary info from /proc,
7137  // so use the thread limit algorithm instead. Once we set
7138  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7139  // we shouldn't wind up getting back here.
7140  //
7141  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7142  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7143 
7144  //
7145  // Make this call behave like the thread limit algorithm.
7146  //
7147  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7148  : root->r.r_hot_team->t.t_nproc);
7149  if ( retval > set_nproc ) {
7150  retval = set_nproc;
7151  }
7152  if ( retval < KMP_MIN_NTH ) {
7153  retval = KMP_MIN_NTH;
7154  }
7155 
7156  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7157  return retval;
7158  }
7159 
7160  //
7161  // There is a slight delay in the load balance algorithm in detecting
7162  // new running procs. The real system load at this instant should be
7163  // at least as large as the number of active OpenMP threads that are available to
7164  // add to the team.
7165  //
7166  if ( system_active < team_curr_active ) {
7167  system_active = team_curr_active;
7168  }
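  // retval = free processors plus the threads this team already accounts for itself.
  // For illustration (hypothetical numbers): __kmp_avail_proc == 8, system_active == 5,
  // team_curr_active == 3 gives retval == 6, before the set_nproc / KMP_MIN_NTH clamps below.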
7169  retval = __kmp_avail_proc - system_active + team_curr_active;
7170  if ( retval > set_nproc ) {
7171  retval = set_nproc;
7172  }
7173  if ( retval < KMP_MIN_NTH ) {
7174  retval = KMP_MIN_NTH;
7175  }
7176 
7177  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7178  return retval;
7179 } // __kmp_load_balance_nproc()
7180 
7181 #endif /* USE_LOAD_BALANCE */
7182 
7183 /* ------------------------------------------------------------------------ */
7184 /* ------------------------------------------------------------------------ */
7185 
7186 /* NOTE: this is called with the __kmp_init_lock held */
7187 void
7188 __kmp_cleanup( void )
7189 {
7190  int f;
7191 
7192  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7193 
7194  if (TCR_4(__kmp_init_parallel)) {
7195 #if KMP_HANDLE_SIGNALS
7196  __kmp_remove_signals();
7197 #endif
7198  TCW_4(__kmp_init_parallel, FALSE);
7199  }
7200 
7201  if (TCR_4(__kmp_init_middle)) {
7202 #if KMP_AFFINITY_SUPPORTED
7203  __kmp_affinity_uninitialize();
7204 #endif /* KMP_AFFINITY_SUPPORTED */
7205  __kmp_cleanup_hierarchy();
7206  TCW_4(__kmp_init_middle, FALSE);
7207  }
7208 
7209  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7210 
7211  if (__kmp_init_serial) {
7212  __kmp_runtime_destroy();
7213  __kmp_init_serial = FALSE;
7214  }
7215 
7216  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7217  if ( __kmp_root[ f ] != NULL ) {
7218  __kmp_free( __kmp_root[ f ] );
7219  __kmp_root[ f ] = NULL;
7220  }
7221  }
7222  __kmp_free( __kmp_threads );
7223  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
7224  // to free __kmp_root separately.
7225  __kmp_threads = NULL;
7226  __kmp_root = NULL;
7227  __kmp_threads_capacity = 0;
7228 
7229 #if KMP_USE_DYNAMIC_LOCK
7230  __kmp_cleanup_indirect_user_locks();
7231 #else
7232  __kmp_cleanup_user_locks();
7233 #endif
7234 
7235  #if KMP_AFFINITY_SUPPORTED
7236  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7237  __kmp_cpuinfo_file = NULL;
7238  #endif /* KMP_AFFINITY_SUPPORTED */
7239 
7240  #if KMP_USE_ADAPTIVE_LOCKS
7241  #if KMP_DEBUG_ADAPTIVE_LOCKS
7242  __kmp_print_speculative_stats();
7243  #endif
7244  #endif
7245  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7246  __kmp_nested_nth.nth = NULL;
7247  __kmp_nested_nth.size = 0;
7248  __kmp_nested_nth.used = 0;
7249 
7250  __kmp_i18n_catclose();
7251 
7252 #if KMP_STATS_ENABLED
7253  __kmp_accumulate_stats_at_exit();
7254  __kmp_stats_list.deallocate();
7255 #endif
7256 
7257  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7258 }
7259 
7260 /* ------------------------------------------------------------------------ */
7261 /* ------------------------------------------------------------------------ */
7262 
7263 int
7264 __kmp_ignore_mppbeg( void )
7265 {
7266  char *env;
7267 
7268  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7269  if (__kmp_str_match_false( env ))
7270  return FALSE;
7271  }
7272  // By default __kmpc_begin() is a no-op.
7273  return TRUE;
7274 }
7275 
7276 int
7277 __kmp_ignore_mppend( void )
7278 {
7279  char *env;
7280 
7281  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7282  if (__kmp_str_match_false( env ))
7283  return FALSE;
7284  }
7285  // By default __kmpc_end() is no-op.
7286  return TRUE;
7287 }
7288 
7289 void
7290 __kmp_internal_begin( void )
7291 {
7292  int gtid;
7293  kmp_root_t *root;
7294 
7295  /* this is a very important step as it will register new sibling threads
7296  * and assign these new uber threads a new gtid */
7297  gtid = __kmp_entry_gtid();
7298  root = __kmp_threads[ gtid ]->th.th_root;
7299  KMP_ASSERT( KMP_UBER_GTID( gtid ));
7300 
7301  if( root->r.r_begin ) return;
7302  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7303  if( root->r.r_begin ) {
7304  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7305  return;
7306  }
7307 
7308  root->r.r_begin = TRUE;
7309 
7310  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7311 }
7312 
7313 
7314 /* ------------------------------------------------------------------------ */
7315 /* ------------------------------------------------------------------------ */
7316 
7317 void
7318 __kmp_user_set_library (enum library_type arg)
7319 {
7320  int gtid;
7321  kmp_root_t *root;
7322  kmp_info_t *thread;
7323 
7324  /* first, make sure we are initialized so we can get our gtid */
7325 
7326  gtid = __kmp_entry_gtid();
7327  thread = __kmp_threads[ gtid ];
7328 
7329  root = thread->th.th_root;
7330 
7331  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7332  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7333  KMP_WARNING( SetLibraryIncorrectCall );
7334  return;
7335  }
7336 
7337  switch ( arg ) {
7338  case library_serial :
7339  thread->th.th_set_nproc = 0;
7340  set__nproc( thread, 1 );
7341  break;
7342  case library_turnaround :
7343  thread->th.th_set_nproc = 0;
7344  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7345  break;
7346  case library_throughput :
7347  thread->th.th_set_nproc = 0;
7348  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7349  break;
7350  default:
7351  KMP_FATAL( UnknownLibraryType, arg );
7352  }
7353 
7354  __kmp_aux_set_library ( arg );
7355 }
7356 
7357 void
7358 __kmp_aux_set_stacksize( size_t arg )
7359 {
7360  if (! __kmp_init_serial)
7361  __kmp_serial_initialize();
7362 
7363 #if KMP_OS_DARWIN
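  /* On Darwin, the requested size is rounded up below to the next 0x1000-byte (4 KB) boundary,
  which matches the page-size granularity typically required for pthread stack sizes there. */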
7364  if (arg & (0x1000 - 1)) {
7365  arg &= ~(0x1000 - 1);
7366  if(arg + 0x1000) /* check for overflow if we round up */
7367  arg += 0x1000;
7368  }
7369 #endif
7370  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7371 
7372  /* only change the default stacksize before the first parallel region */
7373  if (! TCR_4(__kmp_init_parallel)) {
7374  size_t value = arg; /* argument is in bytes */
7375 
7376  if (value < __kmp_sys_min_stksize )
7377  value = __kmp_sys_min_stksize ;
7378  else if (value > KMP_MAX_STKSIZE)
7379  value = KMP_MAX_STKSIZE;
7380 
7381  __kmp_stksize = value;
7382 
7383  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7384  }
7385 
7386  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7387 }
7388 
7389 /* set the behaviour of the runtime library */
7390 /* TODO this can cause some odd behaviour with sibling parallelism... */
7391 void
7392 __kmp_aux_set_library (enum library_type arg)
7393 {
7394  __kmp_library = arg;
7395 
7396  switch ( __kmp_library ) {
7397  case library_serial :
7398  {
7399  KMP_INFORM( LibraryIsSerial );
7400  (void) __kmp_change_library( TRUE );
7401  }
7402  break;
7403  case library_turnaround :
7404  (void) __kmp_change_library( TRUE );
7405  break;
7406  case library_throughput :
7407  (void) __kmp_change_library( FALSE );
7408  break;
7409  default:
7410  KMP_FATAL( UnknownLibraryType, arg );
7411  }
7412 }
7413 
7414 /* ------------------------------------------------------------------------ */
7415 /* ------------------------------------------------------------------------ */
7416 
7417 void
7418 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7419 {
7420  int blocktime = arg; /* argument is in milliseconds */
7421  int bt_intervals;
7422  int bt_set;
7423 
7424  __kmp_save_internal_controls( thread );
7425 
7426  /* Normalize and set blocktime for the teams */
7427  if (blocktime < KMP_MIN_BLOCKTIME)
7428  blocktime = KMP_MIN_BLOCKTIME;
7429  else if (blocktime > KMP_MAX_BLOCKTIME)
7430  blocktime = KMP_MAX_BLOCKTIME;
7431 
7432  set__blocktime_team( thread->th.th_team, tid, blocktime );
7433  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7434 
7435  /* Calculate and set blocktime intervals for the teams */
7436  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7437 
7438  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7439  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7440 
7441  /* Set whether blocktime has been set to "TRUE" */
7442  bt_set = TRUE;
7443 
7444  set__bt_set_team( thread->th.th_team, tid, bt_set );
7445  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7446  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
7447  __kmp_gtid_from_tid(tid, thread->th.th_team),
7448  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
7449 }
7450 
7451 void
7452 __kmp_aux_set_defaults(
7453  char const * str,
7454  int len
7455 ) {
7456  if ( ! __kmp_init_serial ) {
7457  __kmp_serial_initialize();
7458  };
7459  __kmp_env_initialize( str );
7460 
7461  if (__kmp_settings
7462 #if OMP_40_ENABLED
7463  || __kmp_display_env || __kmp_display_env_verbose
7464 #endif // OMP_40_ENABLED
7465  ) {
7466  __kmp_env_print();
7467  }
7468 } // __kmp_aux_set_defaults
7469 
7470 /* ------------------------------------------------------------------------ */
7471 
7472 /*
7473  * internal fast reduction routines
7474  */
7475 
7476 PACKED_REDUCTION_METHOD_T
7477 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7478  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7479  kmp_critical_name *lck )
7480 {
7481 
7482  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7483  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7484  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7485  // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7486 
7487  PACKED_REDUCTION_METHOD_T retval;
7488 
7489  int team_size;
7490 
7491  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
7492  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
7493 
7494  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7495  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
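  // The ATOMIC method counts as generated when the compiler set KMP_IDENT_ATOMIC_REDUCE in
  // loc->flags; the TREE method counts as generated when both reduce_data and reduce_func
  // were supplied by the compiler.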
7496 
7497  retval = critical_reduce_block;
7498 
7499  team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
7500 
7501  if( team_size == 1 ) {
7502 
7503  retval = empty_reduce_block;
7504 
7505  } else {
7506 
7507  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7508  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7509 
7510  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7511 
7512  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7513 
7514  int teamsize_cutoff = 4;
7515 
7516 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7517  if( __kmp_mic_type != non_mic ) {
7518  teamsize_cutoff = 8;
7519  }
7520 #endif
7521  if( tree_available ) {
7522  if( team_size <= teamsize_cutoff ) {
7523  if ( atomic_available ) {
7524  retval = atomic_reduce_block;
7525  }
7526  } else {
7527  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7528  }
7529  } else if ( atomic_available ) {
7530  retval = atomic_reduce_block;
7531  }
7532  #else
7533  #error "Unknown or unsupported OS"
7534  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7535 
7536  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7537 
7538  #if KMP_OS_LINUX || KMP_OS_WINDOWS
7539 
7540  // basic tuning
7541 
7542  if( atomic_available ) {
7543  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7544  retval = atomic_reduce_block;
7545  }
7546  } // otherwise: use critical section
7547 
7548  #elif KMP_OS_DARWIN
7549 
7550  if( atomic_available && ( num_vars <= 3 ) ) {
7551  retval = atomic_reduce_block;
7552  } else if( tree_available ) {
7553  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7554  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7555  }
7556  } // otherwise: use critical section
7557 
7558  #else
7559  #error "Unknown or unsupported OS"
7560  #endif
7561 
7562  #else
7563  #error "Unknown or unsupported architecture"
7564  #endif
7565 
7566  }
7567 
7568  // KMP_FORCE_REDUCTION
7569 
7570  // If the team is serialized (team_size == 1), ignore the forced reduction
7571  // method and stay with the unsynchronized method (empty_reduce_block)
7572  if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7573 
7574  PACKED_REDUCTION_METHOD_T forced_retval;
7575 
7576  int atomic_available, tree_available;
7577 
7578  switch( ( forced_retval = __kmp_force_reduction_method ) )
7579  {
7580  case critical_reduce_block:
7581  KMP_ASSERT( lck ); // lck should be != 0
7582  break;
7583 
7584  case atomic_reduce_block:
7585  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7586  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
7587  break;
7588 
7589  case tree_reduce_block:
7590  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7591  KMP_ASSERT( tree_available ); // tree_available should be != 0
7592  #if KMP_FAST_REDUCTION_BARRIER
7593  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7594  #endif
7595  break;
7596 
7597  default:
7598  KMP_ASSERT( 0 ); // "unsupported method specified"
7599  }
7600 
7601  retval = forced_retval;
7602  }
7603 
7604  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7605 
7606  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7607  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7608 
7609  return ( retval );
7610 }
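// For illustration (hypothetical case, assuming no KMP_FORCE_REDUCTION override): on a non-MIC
// x86_64 Linux target with both methods generated and team_size == 16, the logic above picks
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER, since 16 exceeds the teamsize cutoff of 4; with
// team_size == 4 and the atomic method generated, it picks atomic_reduce_block instead.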
7611 
7612 // this function is for testing set/get/determine reduce method
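// The packed value appears to keep the barrier type in the low 8 bits and the reduction method
// in the higher bits, which is why the getter below shifts right by 8.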
7613 kmp_int32
7614 __kmp_get_reduce_method( void ) {
7615  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7616 }
7617 
7618 /* ------------------------------------------------------------------------ */