LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is constant inside any dispatch loop; however,
20  * it may change between parallel regions. __kmp_max_nth
21  * is the largest value __kmp_nth may take; 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35  #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49  static const T mx;
50  static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54  static const int mx = 0x7fffffff;
55  static const int mn = 0x80000000;
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59  static const unsigned int mx = 0xffffffff;
60  static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64  static const long long mx = 0x7fffffffffffffffLL;
65  static const long long mn = 0x8000000000000000LL;
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
69  static const unsigned long long mx = 0xffffffffffffffffLL;
70  static const unsigned long long mn = 0x0000000000000000LL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77  template< typename T >
78  struct dispatch_private_infoXX_template {
79  typedef typename traits_t< T >::unsigned_t UT;
80  typedef typename traits_t< T >::signed_t ST;
81  UT count; // unsigned
82  T ub;
83  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84  T lb;
85  ST st; // signed
86  UT tc; // unsigned
87  T static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89  /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92  // a) parm3 is properly aligned and
93  // b) all parm1-4 are in the same cache line.
94  // Because parm1-4 are used together, performance seems to be better
95  // if they are in the same line (not measured though).
96 
97  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98  T parm1;
99  T parm2;
100  T parm3;
101  T parm4;
102  };
103 
104  UT ordered_lower; // unsigned
105  UT ordered_upper; // unsigned
106  #if KMP_OS_WINDOWS
107  T last_upper;
108  #endif /* KMP_OS_WINDOWS */
109  };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114  template< typename T >
115  struct dispatch_private_infoXX_template {
116  typedef typename traits_t< T >::unsigned_t UT;
117  typedef typename traits_t< T >::signed_t ST;
118  T lb;
119  T ub;
120  ST st; // signed
121  UT tc; // unsigned
122 
123  T parm1;
124  T parm2;
125  T parm3;
126  T parm4;
127 
128  UT count; // unsigned
129 
130  UT ordered_lower; // unsigned
131  UT ordered_upper; // unsigned
132  #if KMP_OS_WINDOWS
133  T last_upper;
134  #endif /* KMP_OS_WINDOWS */
135  };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
142  // duplicate alignment here, otherwise size of structure is not correct in our compiler
143  union KMP_ALIGN_CACHE private_info_tmpl {
144  dispatch_private_infoXX_template< T > p;
145  dispatch_private_info64_t p64;
146  } u;
147  enum sched_type schedule; /* scheduling algorithm */
148  kmp_uint32 ordered; /* ordered clause specified */
149  kmp_uint32 ordered_bumped;
150  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152  kmp_uint32 nomerge; /* don't merge iters if serialized */
153  kmp_uint32 type_size;
154  enum cons_type pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161  /* chunk index under dynamic, number of idle threads under static-steal;
162  iteration index otherwise */
163  volatile UT iteration;
164  volatile UT num_done;
165  volatile UT ordered_iteration;
166  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172  // we need union here to keep the structure size
173  union shared_info_tmpl {
174  dispatch_shared_infoXX_template< UT > s;
175  dispatch_shared_info64_t s64;
176  } u;
177  volatile kmp_uint32 buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194  kmp_int32 r;
195  r = KMP_TEST_THEN_ADD32( p, d );
196  return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203  kmp_int64 r;
204  r = KMP_TEST_THEN_ADD64( p, d );
205  return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217  kmp_int32 r;
218  r = KMP_TEST_THEN_INC_ACQ32( p );
219  return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226  kmp_int64 r;
227  r = KMP_TEST_THEN_INC_ACQ64( p );
228  return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240  kmp_int32 r;
241  r = KMP_TEST_THEN_INC32( p );
242  return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249  kmp_int64 r;
250  r = KMP_TEST_THEN_INC64( p );
251  return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274  Spin wait loop that first does pause, then yield.
275  Waits until function returns non-zero when called with *spinner and check.
276  Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278  Arguments:
279  obj -- the higher-level synchronization object to report to ittnotify. It is used to report
280  locks consistently. For example, if the lock is acquired immediately, its address is
281  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
282  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
283  address, not the address of the low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290  UT checker,
291  kmp_uint32 (* pred)( UT, UT )
292  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
293  )
294 {
295  // note: we may not belong to a team at this point
296  register volatile UT * spin = spinner;
297  register UT check = checker;
298  register kmp_uint32 spins;
299  register kmp_uint32 (*f) ( UT, UT ) = pred;
300  register UT r;
301 
302  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303  KMP_INIT_YIELD( spins );
304  // main wait spin loop
305  while(!f(r = *spin, check))
306  {
307  KMP_FSYNC_SPIN_PREPARE( obj );
308  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309  It causes problems with infinite recursion because of exit lock */
310  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311  __kmp_abort_thread(); */
312 
313  // If we are oversubscribed,
314  // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
315  // the pause is in the following code.
316  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317  KMP_YIELD_SPIN( spins );
318  }
319  KMP_FSYNC_SPIN_ACQUIRED( obj );
320  return r;
321 }
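// A minimal usage sketch of __kmp_wait_yield(), mirroring the way it is invoked from
// __kmp_dispatch_deo() further below: spin (pause/yield, but never sleep) until the
// shared ordered_iteration counter reaches this thread's ordered lower bound.
//
//     UT lower = pr->u.p.ordered_lower;
//     __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
//                             USE_ITT_BUILD_ARG( NULL ) );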
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325  return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330  return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335  return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340  return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345  return value <= checker;
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355  kmp_info_t *th;
356 
357  KMP_DEBUG_ASSERT( gtid_ref );
358 
359  if ( __kmp_env_consistency_check ) {
360  th = __kmp_threads[*gtid_ref];
361  if ( th -> th.th_root -> r.r_active
362  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368  }
369  }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376  typedef typename traits_t< UT >::signed_t ST;
377  dispatch_private_info_template< UT > * pr;
378 
379  int gtid = *gtid_ref;
380 // int cid = *cid_ref;
381  kmp_info_t *th = __kmp_threads[ gtid ];
382  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385  if ( __kmp_env_consistency_check ) {
386  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387  ( th -> th.th_dispatch -> th_dispatch_pr_current );
388  if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394  }
395  }
396 
397  if ( ! th -> th.th_team -> t.t_serialized ) {
398  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399  ( th -> th.th_dispatch -> th_dispatch_sh_current );
400  UT lower;
401 
402  if ( ! __kmp_env_consistency_check ) {
403  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404  ( th -> th.th_dispatch -> th_dispatch_pr_current );
405  }
406  lower = pr->u.p.ordered_lower;
407 
408  #if ! defined( KMP_GOMP_COMPAT )
409  if ( __kmp_env_consistency_check ) {
410  if ( pr->ordered_bumped ) {
411  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412  __kmp_error_construct2(
413  kmp_i18n_msg_CnsMultipleNesting,
414  ct_ordered_in_pdo, loc_ref,
415  & p->stack_data[ p->w_top ]
416  );
417  }
418  }
419  #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421  KMP_MB();
422  #ifdef KMP_DEBUG
423  {
424  const char * buff;
425  // create format specifiers before the debug output
426  buff = __kmp_str_format(
427  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428  traits_t< UT >::spec, traits_t< UT >::spec );
429  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430  __kmp_str_free( &buff );
431  }
432  #endif
433 
434  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435  USE_ITT_BUILD_ARG( NULL )
436  );
437  KMP_MB(); /* is this necessary? */
438  #ifdef KMP_DEBUG
439  {
440  const char * buff;
441  // create format specifiers before the debug output
442  buff = __kmp_str_format(
443  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444  traits_t< UT >::spec, traits_t< UT >::spec );
445  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446  __kmp_str_free( &buff );
447  }
448  #endif
449  }
450  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456  kmp_info_t *th;
457 
458  if ( __kmp_env_consistency_check ) {
459  th = __kmp_threads[*gtid_ref];
460  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462  }
463  }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470  typedef typename traits_t< UT >::signed_t ST;
471  dispatch_private_info_template< UT > * pr;
472 
473  int gtid = *gtid_ref;
474 // int cid = *cid_ref;
475  kmp_info_t *th = __kmp_threads[ gtid ];
476  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479  if ( __kmp_env_consistency_check ) {
480  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481  ( th -> th.th_dispatch -> th_dispatch_pr_current );
482  if ( pr -> pushed_ws != ct_none ) {
483  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484  }
485  }
486 
487  if ( ! th -> th.th_team -> t.t_serialized ) {
488  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489  ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491  if ( ! __kmp_env_consistency_check ) {
492  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493  ( th -> th.th_dispatch -> th_dispatch_pr_current );
494  }
495 
496  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497  #if ! defined( KMP_GOMP_COMPAT )
498  if ( __kmp_env_consistency_check ) {
499  if ( pr->ordered_bumped != 0 ) {
500  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501  /* How to test it? - OM */
502  __kmp_error_construct2(
503  kmp_i18n_msg_CnsMultipleNesting,
504  ct_ordered_in_pdo, loc_ref,
505  & p->stack_data[ p->w_top ]
506  );
507  }
508  }
509  #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511  KMP_MB(); /* Flush all pending memory write invalidates. */
512 
513  pr->ordered_bumped += 1;
514 
515  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516  gtid, pr->ordered_bumped ) );
517 
518  KMP_MB(); /* Flush all pending memory write invalidates. */
519 
520  /* TODO use general release procedure? */
521  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523  KMP_MB(); /* Flush all pending memory write invalidates. */
524  }
525  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
528 /* Computes and returns x to the power of y, where y must be a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532  long double s=1.0L;
533 
534  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536  while(y) {
537  if ( y & 1 )
538  s *= x;
539  x *= x;
540  y >>= 1;
541  }
542  return s;
543 }
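// Worked example (illustrative values, not taken from the code): __kmp_pow(0.75L, 10)
// walks the bits of y = 10 (binary 1010) with square-and-multiply:
//     y=10: x -> 0.5625          y=5: s -> 0.5625,  x -> 0.31640625
//     y=2:  x -> ~0.1001129      y=1: s -> ~0.0563135
// and returns ~0.0563135, i.e. 0.75 raised to the 10th power.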
544 
545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547  __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
548  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549 */
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553  T tc,
554  typename traits_t< T >::floating_t base,
555  typename traits_t< T >::unsigned_t idx
556 ) {
557  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558  least for ICL 8.1, long double arithmetic may not really have
559  long double precision, even with /Qlong_double. Currently, we
560  workaround that in the caller code, by manipulating the FPCW for
561  Windows* OS on IA-32 architecture. The lack of precision is not
562  expected to be a correctness issue, though.
563  */
564  typedef typename traits_t< T >::unsigned_t UT;
565 
566  long double x = tc * __kmp_pow< UT >(base, idx);
567  UT r = (UT) x;
568  if ( x == r )
569  return r;
570  return r + 1;
571 }
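// Worked example (illustrative values): with tc = 1000, base = 0.75 and idx = 3,
// x = 1000 * 0.75^3 = 421.875, which is not integral, so the function returns 422,
// i.e. ceil(tc * base^idx) -- the iterations still unassigned after 3 chunks.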
572 
573 // Parameters of the guided-iterative algorithm:
574 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
575 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
576 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
577 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
578 static int guided_int_param = 2;
579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
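// Worked example (illustrative values): with the default n = 2, nproc = 4 and chunk = 1,
// the guided-iterative case below sets parm2 = 2 * 4 * (1 + 1) = 16 (switch to plain
// dynamic once fewer than 16 iterations remain) and parm3 = 0.5 / 4 = 0.125
// (each grab takes roughly 1/8 of the remaining iterations).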
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586  ident_t * loc,
587  int gtid,
588  enum sched_type schedule,
589  T lb,
590  T ub,
591  typename traits_t< T >::signed_t st,
592  typename traits_t< T >::signed_t chunk,
593  int push_ws
594 ) {
595  typedef typename traits_t< T >::unsigned_t UT;
596  typedef typename traits_t< T >::signed_t ST;
597  typedef typename traits_t< T >::floating_t DBL;
598  static const int ___kmp_size_type = sizeof( UT );
599 
600  int active;
601  T tc;
602  kmp_info_t * th;
603  kmp_team_t * team;
604  kmp_uint32 my_buffer_index;
605  dispatch_private_info_template< T > * pr;
606  dispatch_shared_info_template< UT > volatile * sh;
607 
608  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611  if ( ! TCR_4( __kmp_init_parallel ) )
612  __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615  SSC_MARK_DISPATCH_INIT();
616 #endif
617  #ifdef KMP_DEBUG
618  {
619  const char * buff;
620  // create format specifiers before the debug output
621  buff = __kmp_str_format(
622  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625  __kmp_str_free( &buff );
626  }
627  #endif
628  /* setup data */
629  th = __kmp_threads[ gtid ];
630  team = th -> th.th_team;
631  active = ! team -> t.t_serialized;
632  th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635  kmp_uint64 cur_chunk = chunk;
636  int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637  KMP_MASTER_GTID(gtid) &&
638 #if OMP_40_ENABLED
639  th->th.th_teams_microtask == NULL &&
640 #endif
641  team->t.t_active_level == 1;
642 #endif
643  if ( ! active ) {
644  pr = reinterpret_cast< dispatch_private_info_template< T >* >
645  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646  } else {
647  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649 
650  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651 
652  /* What happens when the number of threads changes? Do we need to resize the buffer? */
653  pr = reinterpret_cast< dispatch_private_info_template< T > * >
654  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657  }
658 
659  /* Pick up the nomerge/ordered bits from the scheduling type */
660  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661  pr->nomerge = TRUE;
662  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663  } else {
664  pr->nomerge = FALSE;
665  }
666  pr->type_size = ___kmp_size_type; // remember the size of variables
667  if ( kmp_ord_lower & schedule ) {
668  pr->ordered = TRUE;
669  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670  } else {
671  pr->ordered = FALSE;
672  }
673 
674  if ( schedule == kmp_sch_static ) {
675  schedule = __kmp_static;
676  } else {
677  if ( schedule == kmp_sch_runtime ) {
678  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
679  schedule = team -> t.t_sched.r_sched_type;
680  // Detail the schedule if needed (global controls are differentiated appropriately)
681  if ( schedule == kmp_sch_guided_chunked ) {
682  schedule = __kmp_guided;
683  } else if ( schedule == kmp_sch_static ) {
684  schedule = __kmp_static;
685  }
686  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
687  chunk = team -> t.t_sched.chunk;
688 #if USE_ITT_BUILD
689  cur_chunk = chunk;
690 #endif
691  #ifdef KMP_DEBUG
692  {
693  const char * buff;
694  // create format specifiers before the debug output
695  buff = __kmp_str_format(
696  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
697  traits_t< ST >::spec );
698  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
699  __kmp_str_free( &buff );
700  }
701  #endif
702  } else {
703  if ( schedule == kmp_sch_guided_chunked ) {
704  schedule = __kmp_guided;
705  }
706  if ( chunk <= 0 ) {
707  chunk = KMP_DEFAULT_CHUNK;
708  }
709  }
710 
711  if ( schedule == kmp_sch_auto ) {
712  // mapping and differentiation: in the __kmp_do_serial_initialize()
713  schedule = __kmp_auto;
714  #ifdef KMP_DEBUG
715  {
716  const char * buff;
717  // create format specifiers before the debug output
718  buff = __kmp_str_format(
719  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
720  traits_t< ST >::spec );
721  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
722  __kmp_str_free( &buff );
723  }
724  #endif
725  }
726 
727  /* guided analytical not safe for too many threads */
728  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
729  schedule = kmp_sch_guided_iterative_chunked;
730  KMP_WARNING( DispatchManyThreads );
731  }
732  pr->u.p.parm1 = chunk;
733  }
734  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
735  "unknown scheduling type" );
736 
737  pr->u.p.count = 0;
738 
739  if ( __kmp_env_consistency_check ) {
740  if ( st == 0 ) {
741  __kmp_error_construct(
742  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
743  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
744  );
745  }
746  }
747 
748  tc = ( ub - lb + st );
749  if ( st != 1 ) {
750  if ( st < 0 ) {
751  if ( lb < ub ) {
752  tc = 0; // zero-trip
753  } else { // lb >= ub
754  tc = (ST)tc / st; // convert to signed division
755  }
756  } else { // st > 0
757  if ( ub < lb ) {
758  tc = 0; // zero-trip
759  } else { // ub >= lb
760  tc /= st;
761  }
762  }
763  } else if ( ub < lb ) { // st == 1
764  tc = 0; // zero-trip
765  }
766 
767  // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
768  // when statistics are disabled.
769  if (schedule == __kmp_static)
770  {
771  KMP_COUNT_BLOCK(OMP_FOR_static);
772  KMP_COUNT_VALUE(FOR_static_iterations, tc);
773  }
774  else
775  {
776  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
777  KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
778  }
779 
780  pr->u.p.lb = lb;
781  pr->u.p.ub = ub;
782  pr->u.p.st = st;
783  pr->u.p.tc = tc;
784 
785  #if KMP_OS_WINDOWS
786  pr->u.p.last_upper = ub + st;
787  #endif /* KMP_OS_WINDOWS */
788 
789  /* NOTE: only the active parallel region(s) have active ordered sections */
790 
791  if ( active ) {
792  if ( pr->ordered == 0 ) {
793  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
794  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
795  } else {
796  pr->ordered_bumped = 0;
797 
798  pr->u.p.ordered_lower = 1;
799  pr->u.p.ordered_upper = 0;
800 
801  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
802  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
803  }
804  }
805 
806  if ( __kmp_env_consistency_check ) {
807  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
808  if ( push_ws ) {
809  __kmp_push_workshare( gtid, ws, loc );
810  pr->pushed_ws = ws;
811  } else {
812  __kmp_check_workshare( gtid, ws, loc );
813  pr->pushed_ws = ct_none;
814  }
815  }
816 
817  switch ( schedule ) {
818  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
819  case kmp_sch_static_steal:
820  {
821  T nproc = team->t.t_nproc;
822  T ntc, init;
823 
824  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
825 
826  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
827  if ( nproc > 1 && ntc >= nproc ) {
828  T id = __kmp_tid_from_gtid(gtid);
829  T small_chunk, extras;
830 
831  small_chunk = ntc / nproc;
832  extras = ntc % nproc;
833 
834  init = id * small_chunk + ( id < extras ? id : extras );
835  pr->u.p.count = init;
836  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
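 // Worked example (illustrative values): with nproc = 4 and ntc = 10 chunks,
 // small_chunk = 2 and extras = 2, so the initial chunk-index ranges are
 // thread 0: [0,3), thread 1: [3,6), thread 2: [6,8), thread 3: [8,10);
 // count is the next chunk index to execute, ub its (exclusive) upper bound.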
837 
838  pr->u.p.parm2 = lb;
839  //pr->pfields.parm3 = 0; // it's not used in static_steal
840  pr->u.p.parm4 = id;
841  pr->u.p.st = st;
842  break;
843  } else {
844  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
845  gtid ) );
846  schedule = kmp_sch_static_balanced;
847  /* too few iterations: fall-through to kmp_sch_static_balanced */
848  } // if
849  /* FALL-THROUGH to static balanced */
850  } // case
851  #endif
852  case kmp_sch_static_balanced:
853  {
854  T nproc = team->t.t_nproc;
855  T init, limit;
856 
857  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
858  gtid ) );
859 
860  if ( nproc > 1 ) {
861  T id = __kmp_tid_from_gtid(gtid);
862 
863  if ( tc < nproc ) {
864  if ( id < tc ) {
865  init = id;
866  limit = id;
867  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
868  } else {
869  pr->u.p.count = 1; /* means no more chunks to execute */
870  pr->u.p.parm1 = FALSE;
871  break;
872  }
873  } else {
874  T small_chunk = tc / nproc;
875  T extras = tc % nproc;
876  init = id * small_chunk + (id < extras ? id : extras);
877  limit = init + small_chunk - (id < extras ? 0 : 1);
878  pr->u.p.parm1 = (id == nproc - 1);
879  }
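 // Worked example (illustrative values): tc = 10, nproc = 4 gives small_chunk = 2 and
 // extras = 2, so thread 0 covers iterations 0-2, thread 1: 3-5, thread 2: 6-7,
 // thread 3: 8-9, and parm1 (the lastprivate flag) is TRUE only for thread 3.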
880  } else {
881  if ( tc > 0 ) {
882  init = 0;
883  limit = tc - 1;
884  pr->u.p.parm1 = TRUE;
885  } else {
886  // zero trip count
887  pr->u.p.count = 1; /* means no more chunks to execute */
888  pr->u.p.parm1 = FALSE;
889  break;
890  }
891  }
892 #if USE_ITT_BUILD
893  // Calculate chunk for metadata report
894  if ( itt_need_metadata_reporting )
895  cur_chunk = limit - init + 1;
896 #endif
897  if ( st == 1 ) {
898  pr->u.p.lb = lb + init;
899  pr->u.p.ub = lb + limit;
900  } else {
901  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
902  pr->u.p.lb = lb + init * st;
903  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
904  if ( st > 0 ) {
905  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
906  } else {
907  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
908  }
909  }
910  if ( pr->ordered ) {
911  pr->u.p.ordered_lower = init;
912  pr->u.p.ordered_upper = limit;
913  }
914  break;
915  } // case
916  case kmp_sch_guided_iterative_chunked :
917  {
918  T nproc = team->t.t_nproc;
919  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
920 
921  if ( nproc > 1 ) {
922  if ( (2L * chunk + 1 ) * nproc >= tc ) {
923  /* chunk size too large, switch to dynamic */
924  schedule = kmp_sch_dynamic_chunked;
925  } else {
926  // when remaining iters become less than parm2 - switch to dynamic
927  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
928  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
929  }
930  } else {
931  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
932  schedule = kmp_sch_static_greedy;
933  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
934  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
935  pr->u.p.parm1 = tc;
936  } // if
937  } // case
938  break;
939  case kmp_sch_guided_analytical_chunked:
940  {
941  T nproc = team->t.t_nproc;
942  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
943 
944  if ( nproc > 1 ) {
945  if ( (2L * chunk + 1 ) * nproc >= tc ) {
946  /* chunk size too large, switch to dynamic */
947  schedule = kmp_sch_dynamic_chunked;
948  } else {
949  /* commonly used term: (2 nproc - 1)/(2 nproc) */
950  DBL x;
951 
952  #if KMP_OS_WINDOWS && KMP_ARCH_X86
953  /* Linux* OS already has 64-bit computation by default for
954  long double, and on Windows* OS on Intel(R) 64,
955  /Qlong_double doesn't work. On Windows* OS
956  on IA-32 architecture, we need to set precision to
957  64-bit instead of the default 53-bit. Even though long
958  double doesn't work on Windows* OS on Intel(R) 64, the
959  resulting lack of precision is not expected to impact
960  the correctness of the algorithm, but this has not been
961  mathematically proven.
962  */
963  // save original FPCW and set precision to 64-bit, as
964  // Windows* OS on IA-32 architecture defaults to 53-bit
965  unsigned int oldFpcw = _control87(0,0);
966  _control87(_PC_64,_MCW_PC); // 0,0x30000
967  #endif
968  /* value used for comparison in solver for cross-over point */
969  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
970 
971  /* crossover point--chunk indexes equal to or greater than
972  this point switch to dynamic-style scheduling */
973  UT cross;
974 
975  /* commonly used term: (2 nproc - 1)/(2 nproc) */
976  x = (long double)1.0 - (long double)0.5 / nproc;
977 
978  #ifdef KMP_DEBUG
979  { // test natural alignment
980  struct _test_a {
981  char a;
982  union {
983  char b;
984  DBL d;
985  };
986  } t;
987  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
988  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
989  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
990  }
991  #endif // KMP_DEBUG
992 
993  /* save the term in thread private dispatch structure */
994  *(DBL*)&pr->u.p.parm3 = x;
995 
996  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
997  {
998  UT left, right, mid;
999  long double p;
1000 
1001  /* estimate initial upper and lower bound */
1002 
1003  /* it doesn't matter what value "right" starts with, as long as it is positive,
1004  but it affects the performance of the solver
1005  */
1006  right = 229;
1007  p = __kmp_pow< UT >(x,right);
1008  if ( p > target ) {
1009  do{
1010  p *= p;
1011  right <<= 1;
1012  } while(p>target && right < (1<<27));
1013  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1014  } else {
1015  left = 0;
1016  }
1017 
1018  /* bisection root-finding method */
1019  while ( left + 1 < right ) {
1020  mid = (left + right) / 2;
1021  if ( __kmp_pow< UT >(x,mid) > target ) {
1022  left = mid;
1023  } else {
1024  right = mid;
1025  }
1026  } // while
1027  cross = right;
1028  }
1029  /* assert sanity of computed crossover point */
1030  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
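 // Worked example (illustrative values): nproc = 2, chunk = 1, tc = 100 give
 // x = 1 - 0.5/2 = 0.75 and target = (2*1 + 1)*2/100 = 0.06; the bisection then
 // yields cross = 10, since 0.75^9 ~ 0.075 > 0.06 while 0.75^10 ~ 0.056 <= 0.06.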
1031 
1032  /* save the crossover point in thread private dispatch structure */
1033  pr->u.p.parm2 = cross;
1034 
1035  // C75803
1036  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1037  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1038  #else
1039  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1040  #endif
1041  /* dynamic-style scheduling offset */
1042  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1043  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1044  // restore FPCW
1045  _control87(oldFpcw,_MCW_PC);
1046  #endif
1047  } // if
1048  } else {
1049  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1050  gtid ) );
1051  schedule = kmp_sch_static_greedy;
1052  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1053  pr->u.p.parm1 = tc;
1054  } // if
1055  } // case
1056  break;
1057  case kmp_sch_static_greedy:
1058  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1059  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1060  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1061  tc;
1062  break;
1063  case kmp_sch_static_chunked :
1064  case kmp_sch_dynamic_chunked :
1065  if ( pr->u.p.parm1 <= 0 ) {
1066  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1067  }
1068  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1069  break;
1070  case kmp_sch_trapezoidal :
1071  {
1072  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1073 
1074  T parm1, parm2, parm3, parm4;
1075  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1076 
1077  parm1 = chunk;
1078 
1079  /* F : size of the first cycle */
1080  parm2 = ( tc / (2 * team->t.t_nproc) );
1081 
1082  if ( parm2 < 1 ) {
1083  parm2 = 1;
1084  }
1085 
1086  /* L : size of the last cycle. Make sure the last cycle
1087  * is not larger than the first cycle.
1088  */
1089  if ( parm1 < 1 ) {
1090  parm1 = 1;
1091  } else if ( parm1 > parm2 ) {
1092  parm1 = parm2;
1093  }
1094 
1095  /* N : number of cycles */
1096  parm3 = ( parm2 + parm1 );
1097  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1098 
1099  if ( parm3 < 2 ) {
1100  parm3 = 2;
1101  }
1102 
1103  /* sigma : decreasing incr of the trapezoid */
1104  parm4 = ( parm3 - 1 );
1105  parm4 = ( parm2 - parm1 ) / parm4;
1106 
1107  // pointless check, because parm4 >= 0 always
1108  //if ( parm4 < 0 ) {
1109  // parm4 = 0;
1110  //}
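 // Worked example (illustrative values): tc = 1000, nproc = 4, chunk = 10 give
 // parm2 (first chunk) = 1000/8 = 125, parm1 (minimum/last chunk) = 10,
 // parm3 (number of cycles) = (2000 + 134)/135 = 15, parm4 (decrement) = 115/14 = 8,
 // i.e. chunk sizes 125, 117, 109, ..., 13, whose sum (1035) covers all 1000 iterations.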
1111 
1112  pr->u.p.parm1 = parm1;
1113  pr->u.p.parm2 = parm2;
1114  pr->u.p.parm3 = parm3;
1115  pr->u.p.parm4 = parm4;
1116  } // case
1117  break;
1118 
1119  default:
1120  {
1121  __kmp_msg(
1122  kmp_ms_fatal, // Severity
1123  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1124  KMP_HNT( GetNewerLibrary ), // Hint
1125  __kmp_msg_null // Variadic argument list terminator
1126  );
1127  }
1128  break;
1129  } // switch
1130  pr->schedule = schedule;
1131  if ( active ) {
1132  /* The name of this buffer should be my_buffer_index when it's free to use it */
1133 
1134  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1135  gtid, my_buffer_index, sh->buffer_index) );
1136  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1137  USE_ITT_BUILD_ARG( NULL )
1138  );
1139  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1140  // *always* 32-bit integers.
1141  KMP_MB(); /* is this necessary? */
1142  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1143  gtid, my_buffer_index, sh->buffer_index) );
1144 
1145  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1146  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1147 #if USE_ITT_BUILD
1148  if ( pr->ordered ) {
1149  __kmp_itt_ordered_init( gtid );
1150  }; // if
1151  // Report loop metadata
1152  if ( itt_need_metadata_reporting ) {
1153  // Only report metadata by master of active team at level 1
1154  kmp_uint64 schedtype = 0;
1155  switch ( schedule ) {
1156  case kmp_sch_static_chunked:
1157  case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1158  break;
1159  case kmp_sch_static_greedy:
1160  cur_chunk = pr->u.p.parm1;
1161  break;
1162  case kmp_sch_dynamic_chunked:
1163  schedtype = 1;
1164  break;
1165  case kmp_sch_guided_iterative_chunked:
1166  case kmp_sch_guided_analytical_chunked:
1167  schedtype = 2;
1168  break;
1169  default:
1170 // Should we put this case under "static"?
1171 // case kmp_sch_static_steal:
1172  schedtype = 3;
1173  break;
1174  }
1175  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1176  }
1177 #endif /* USE_ITT_BUILD */
1178  }; // if
1179 
1180  #ifdef KMP_DEBUG
1181  {
1182  const char * buff;
1183  // create format specifiers before the debug output
1184  buff = __kmp_str_format(
1185  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1186  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1187  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1188  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1189  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1190  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1191  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1192  KD_TRACE(10, ( buff,
1193  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1194  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1195  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1196  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1197  __kmp_str_free( &buff );
1198  }
1199  #endif
1200  #if ( KMP_STATIC_STEAL_ENABLED )
1201  if ( ___kmp_size_type < 8 ) {
1202  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1203  // all the parm3 variables will contain the same value.
1204  // Even if all parm3 values were the same, a bad case would still exist, such as
1205  // using 0 and 1 rather than a program-lifetime increment.
1206  // So a dedicated variable is required; the 'static_steal_counter' is used.
1207  if( schedule == kmp_sch_static_steal ) {
1208  // Other threads will inspect this variable when searching for a victim.
1209  // This is a flag showing that other threads may steal from this thread since then.
1210  volatile T * p = &pr->u.p.static_steal_counter;
1211  *p = *p + 1;
1212  }
1213  }
1214  #endif // ( KMP_STATIC_STEAL_ENABLED )
1215 
1216 #if OMPT_SUPPORT && OMPT_TRACE
1217  if (ompt_enabled &&
1218  ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1219  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1220  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1221  ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1222  team_info->parallel_id, task_info->task_id, team_info->microtask);
1223  }
1224 #endif
1225 }
1226 
1227 /*
1228  * For ordered loops, either __kmp_dispatch_finish() should be called after
1229  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1230  * every chunk of iterations. If the ordered section(s) were not executed
1231  * for this iteration (or every iteration in this chunk), we need to set the
1232  * ordered iteration counters so that the next thread can proceed.
1233  */
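/* A simplified sketch, for illustration only (the exact entry points and types the
 * compiler emits vary with bitness and signedness), of how these routines are driven
 * for an ordered dynamic loop through the __kmpc_* interface:
 *
 *     __kmpc_dispatch_init_4( loc, gtid, kmp_ord_dynamic_chunked, lb, ub, st, chunk );
 *     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( i = lb; i <= ub; i += st ) {
 *             __kmpc_ordered( loc, gtid );         // dispatches to __kmp_dispatch_deo()
 *             ...                                  // ordered body
 *             __kmpc_end_ordered( loc, gtid );     // dispatches to __kmp_dispatch_dxo()
 *             __kmpc_dispatch_fini_4( loc, gtid ); // calls __kmp_dispatch_finish()
 *         }
 *     }
 */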
1234 template< typename UT >
1235 static void
1236 __kmp_dispatch_finish( int gtid, ident_t *loc )
1237 {
1238  typedef typename traits_t< UT >::signed_t ST;
1239  kmp_info_t *th = __kmp_threads[ gtid ];
1240 
1241  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1242  if ( ! th -> th.th_team -> t.t_serialized ) {
1243 
1244  dispatch_private_info_template< UT > * pr =
1245  reinterpret_cast< dispatch_private_info_template< UT >* >
1246  ( th->th.th_dispatch->th_dispatch_pr_current );
1247  dispatch_shared_info_template< UT > volatile * sh =
1248  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1249  ( th->th.th_dispatch->th_dispatch_sh_current );
1250  KMP_DEBUG_ASSERT( pr );
1251  KMP_DEBUG_ASSERT( sh );
1252  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1253  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1254 
1255  if ( pr->ordered_bumped ) {
1256  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1257  gtid ) );
1258  pr->ordered_bumped = 0;
1259  } else {
1260  UT lower = pr->u.p.ordered_lower;
1261 
1262  #ifdef KMP_DEBUG
1263  {
1264  const char * buff;
1265  // create format specifiers before the debug output
1266  buff = __kmp_str_format(
1267  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1268  traits_t< UT >::spec, traits_t< UT >::spec );
1269  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1270  __kmp_str_free( &buff );
1271  }
1272  #endif
1273 
1274  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1275  USE_ITT_BUILD_ARG(NULL)
1276  );
1277  KMP_MB(); /* is this necessary? */
1278  #ifdef KMP_DEBUG
1279  {
1280  const char * buff;
1281  // create format specifiers before the debug output
1282  buff = __kmp_str_format(
1283  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1284  traits_t< UT >::spec, traits_t< UT >::spec );
1285  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1286  __kmp_str_free( &buff );
1287  }
1288  #endif
1289 
1290  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1291  } // if
1292  } // if
1293  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1294 }
1295 
1296 #ifdef KMP_GOMP_COMPAT
1297 
1298 template< typename UT >
1299 static void
1300 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1301 {
1302  typedef typename traits_t< UT >::signed_t ST;
1303  kmp_info_t *th = __kmp_threads[ gtid ];
1304 
1305  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1306  if ( ! th -> th.th_team -> t.t_serialized ) {
1307 // int cid;
1308  dispatch_private_info_template< UT > * pr =
1309  reinterpret_cast< dispatch_private_info_template< UT >* >
1310  ( th->th.th_dispatch->th_dispatch_pr_current );
1311  dispatch_shared_info_template< UT > volatile * sh =
1312  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1313  ( th->th.th_dispatch->th_dispatch_sh_current );
1314  KMP_DEBUG_ASSERT( pr );
1315  KMP_DEBUG_ASSERT( sh );
1316  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1317  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1318 
1319 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1320  UT lower = pr->u.p.ordered_lower;
1321  UT upper = pr->u.p.ordered_upper;
1322  UT inc = upper - lower + 1;
1323 
1324  if ( pr->ordered_bumped == inc ) {
1325  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1326  gtid ) );
1327  pr->ordered_bumped = 0;
1328  } else {
1329  inc -= pr->ordered_bumped;
1330 
1331  #ifdef KMP_DEBUG
1332  {
1333  const char * buff;
1334  // create format specifiers before the debug output
1335  buff = __kmp_str_format(
1336  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1337  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1338  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1339  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1340  __kmp_str_free( &buff );
1341  }
1342  #endif
1343 
1344  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1345  USE_ITT_BUILD_ARG(NULL)
1346  );
1347 
1348  KMP_MB(); /* is this necessary? */
1349  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1350  gtid ) );
1351  pr->ordered_bumped = 0;
1353  #ifdef KMP_DEBUG
1354  {
1355  const char * buff;
1356  // create format specifiers before the debug output
1357  buff = __kmp_str_format(
1358  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1359  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1360  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1361  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1362  __kmp_str_free( &buff );
1363  }
1364  #endif
1365 
1366  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1367  }
1368 // }
1369  }
1370  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1371 }
1372 
1373 #endif /* KMP_GOMP_COMPAT */
1374 
1375 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1376  * (no more work), then tell OMPT the loop is over. In some cases
1377  * kmp_dispatch_fini() is not called. */
1378 #if OMPT_SUPPORT && OMPT_TRACE
1379 #define OMPT_LOOP_END \
1380  if (status == 0) { \
1381  if (ompt_enabled && \
1382  ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1383  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1384  ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1385  ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1386  team_info->parallel_id, task_info->task_id); \
1387  } \
1388  }
1389 #else
1390 #define OMPT_LOOP_END // no-op
1391 #endif
1392 
1393 template< typename T >
1394 static int
1395 __kmp_dispatch_next(
1396  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1397 ) {
1398 
1399  typedef typename traits_t< T >::unsigned_t UT;
1400  typedef typename traits_t< T >::signed_t ST;
1401  typedef typename traits_t< T >::floating_t DBL;
1402 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1403  static const int ___kmp_size_type = sizeof( UT );
1404 #endif
1405 
1406  // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
1407  // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
1408  // more than a compile-time choice to use static scheduling would.)
1409  KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1410 
1411  int status;
1412  dispatch_private_info_template< T > * pr;
1413  kmp_info_t * th = __kmp_threads[ gtid ];
1414  kmp_team_t * team = th -> th.th_team;
1415 
1416  KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1417  #ifdef KMP_DEBUG
1418  {
1419  const char * buff;
1420  // create format specifiers before the debug output
1421  buff = __kmp_str_format(
1422  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1423  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1424  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1425  __kmp_str_free( &buff );
1426  }
1427  #endif
1428 
1429  if ( team -> t.t_serialized ) {
1430  /* NOTE: serialize this dispatch because we are not at the active level */
1431  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1432  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1433  KMP_DEBUG_ASSERT( pr );
1434 
1435  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1436  *p_lb = 0;
1437  *p_ub = 0;
1438 // if ( p_last != NULL )
1439 // *p_last = 0;
1440  if ( p_st != NULL )
1441  *p_st = 0;
1442  if ( __kmp_env_consistency_check ) {
1443  if ( pr->pushed_ws != ct_none ) {
1444  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1445  }
1446  }
1447  } else if ( pr->nomerge ) {
1448  kmp_int32 last;
1449  T start;
1450  UT limit, trip, init;
1451  ST incr;
1452  T chunk = pr->u.p.parm1;
1453 
1454  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1455 
1456  init = chunk * pr->u.p.count++;
1457  trip = pr->u.p.tc - 1;
1458 
1459  if ( (status = (init <= trip)) == 0 ) {
1460  *p_lb = 0;
1461  *p_ub = 0;
1462 // if ( p_last != NULL )
1463 // *p_last = 0;
1464  if ( p_st != NULL )
1465  *p_st = 0;
1466  if ( __kmp_env_consistency_check ) {
1467  if ( pr->pushed_ws != ct_none ) {
1468  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1469  }
1470  }
1471  } else {
1472  start = pr->u.p.lb;
1473  limit = chunk + init - 1;
1474  incr = pr->u.p.st;
1475 
1476  if ( (last = (limit >= trip)) != 0 ) {
1477  limit = trip;
1478  #if KMP_OS_WINDOWS
1479  pr->u.p.last_upper = pr->u.p.ub;
1480  #endif /* KMP_OS_WINDOWS */
1481  }
1482  if ( p_last != NULL )
1483  *p_last = last;
1484  if ( p_st != NULL )
1485  *p_st = incr;
1486  if ( incr == 1 ) {
1487  *p_lb = start + init;
1488  *p_ub = start + limit;
1489  } else {
1490  *p_lb = start + init * incr;
1491  *p_ub = start + limit * incr;
1492  }
1493 
1494  if ( pr->ordered ) {
1495  pr->u.p.ordered_lower = init;
1496  pr->u.p.ordered_upper = limit;
1497  #ifdef KMP_DEBUG
1498  {
1499  const char * buff;
1500  // create format specifiers before the debug output
1501  buff = __kmp_str_format(
1502  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1503  traits_t< UT >::spec, traits_t< UT >::spec );
1504  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1505  __kmp_str_free( &buff );
1506  }
1507  #endif
1508  } // if
1509  } // if
1510  } else {
1511  pr->u.p.tc = 0;
1512  *p_lb = pr->u.p.lb;
1513  *p_ub = pr->u.p.ub;
1514  #if KMP_OS_WINDOWS
1515  pr->u.p.last_upper = *p_ub;
1516  #endif /* KMP_OS_WINDOWS */
1517  if ( p_last != NULL )
1518  *p_last = TRUE;
1519  if ( p_st != NULL )
1520  *p_st = pr->u.p.st;
1521  } // if
1522  #ifdef KMP_DEBUG
1523  {
1524  const char * buff;
1525  // create format specifiers before the debug output
1526  buff = __kmp_str_format(
1527  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1528  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1529  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1530  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1531  __kmp_str_free( &buff );
1532  }
1533  #endif
1534 #if INCLUDE_SSC_MARKS
1535  SSC_MARK_DISPATCH_NEXT();
1536 #endif
1537  OMPT_LOOP_END;
1538  return status;
1539  } else {
1540  kmp_int32 last = 0;
1541  dispatch_shared_info_template< UT > *sh;
1542  T start;
1543  ST incr;
1544  UT limit, trip, init;
1545 
1546  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1547  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1548 
1549  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1550  ( th->th.th_dispatch->th_dispatch_pr_current );
1551  KMP_DEBUG_ASSERT( pr );
1552  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1553  ( th->th.th_dispatch->th_dispatch_sh_current );
1554  KMP_DEBUG_ASSERT( sh );
1555 
1556  if ( pr->u.p.tc == 0 ) {
1557  // zero trip count
1558  status = 0;
1559  } else {
1560  switch (pr->schedule) {
1561  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1562  case kmp_sch_static_steal:
1563  {
1564  T chunk = pr->u.p.parm1;
1565 
1566  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1567 
1568  trip = pr->u.p.tc - 1;
1569 
1570  if ( ___kmp_size_type > 4 ) {
1571  // Other threads do not look into this thread's data,
1572  // so no volatile cast is necessary.
1573  init = ( pr->u.p.count )++;
1574  status = ( init < (UT)pr->u.p.ub );
1575  } else {
1576  typedef union {
1577  struct {
1578  UT count;
1579  T ub;
1580  } p;
1581  kmp_int64 b;
1582  } union_i4;
1583  // All operations on 'count' or 'ub' must be combined atomically together.
1584  // stealing implemented only for 4-byte indexes
1585  {
1586  union_i4 vold, vnew;
1587  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1588  vnew = vold;
1589  vnew.p.count++;
1590  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1591  ( volatile kmp_int64* )&pr->u.p.count,
1592  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1593  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1594  KMP_CPU_PAUSE();
1595  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1596  vnew = vold;
1597  vnew.p.count++;
1598  }
1599  vnew = vold;
1600  init = vnew.p.count;
1601  status = ( init < (UT)vnew.p.ub ) ;
1602  }
1603 
1604  if( !status ) {
1605  kmp_info_t **other_threads = team->t.t_threads;
1606  int while_limit = 10;
1607  int while_index = 0;
1608 
1609  // TODO: algorithm of searching for a victim
1610  // should be cleaned up and measured
1611  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1612  union_i4 vold, vnew;
1613  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1614  T victimIdx = pr->u.p.parm4;
1615  T oldVictimIdx = victimIdx;
1616  dispatch_private_info_template< T > * victim;
1617 
1618  do {
1619  if( !victimIdx ) {
1620  victimIdx = team->t.t_nproc - 1;
1621  } else {
1622  --victimIdx;
1623  }
1624  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1625  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1626  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1627  // TODO: think about a proper place of this test
1628  if ( ( !victim ) ||
1629  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1630  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1631  // TODO: delay would be nice
1632  continue;
1633  // the victim is not ready yet to participate in stealing
1634  // because the victim is still in kmp_init_dispatch
1635  }
1636  if ( oldVictimIdx == victimIdx ) {
1637  break;
1638  }
1639  pr->u.p.parm4 = victimIdx;
1640 
1641  while( 1 ) {
1642  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1643  vnew = vold;
1644 
1645  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1646  if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1647  break;
1648  }
1649  vnew.p.ub -= (remaining >> 2);
1650  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1651  #pragma warning( push )
1652  // disable warning on pointless comparison of unsigned with 0
1653  #pragma warning( disable: 186 )
1654  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1655  #pragma warning( pop )
1656  // TODO: Should this be acquire or release?
1657  if ( KMP_COMPARE_AND_STORE_ACQ64(
1658  ( volatile kmp_int64 * )&victim->u.p.count,
1659  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1660  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1661  status = 1;
1662  while_index = 0;
1663  // now update own count and ub
1664  #if KMP_ARCH_X86
1665  // stealing executed on non-KMP_ARCH_X86 only
1666  // Atomic 64-bit write on ia32 is
1667  // unavailable, so we do this in steps.
1668  // This code is not tested.
1669  init = vold.p.count;
1670  pr->u.p.ub = 0;
1671  pr->u.p.count = init + 1;
1672  pr->u.p.ub = vnew.p.count;
1673  #else
1674  init = vnew.p.ub;
1675  vold.p.count = init + 1;
1676  // TODO: is this safe and sufficient?
1677  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1678  #endif // KMP_ARCH_X86
1679  break;
1680  } // if
1681  KMP_CPU_PAUSE();
1682  } // while (1)
1683  } // while
1684  } // if
1685  } // if
1686  if ( !status ) {
1687  *p_lb = 0;
1688  *p_ub = 0;
1689  if ( p_st != NULL ) *p_st = 0;
1690  } else {
1691  start = pr->u.p.parm2;
1692  init *= chunk;
1693  limit = chunk + init - 1;
1694  incr = pr->u.p.st;
1695 
1696  KMP_DEBUG_ASSERT(init <= trip);
1697  if ( (last = (limit >= trip)) != 0 )
1698  limit = trip;
1699  if ( p_st != NULL ) *p_st = incr;
1700 
1701  if ( incr == 1 ) {
1702  *p_lb = start + init;
1703  *p_ub = start + limit;
1704  } else {
1705  *p_lb = start + init * incr;
1706  *p_ub = start + limit * incr;
1707  }
1708 
1709  if ( pr->ordered ) {
1710  pr->u.p.ordered_lower = init;
1711  pr->u.p.ordered_upper = limit;
1712  #ifdef KMP_DEBUG
1713  {
1714  const char * buff;
1715  // create format specifiers before the debug output
1716  buff = __kmp_str_format(
1717  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1718  traits_t< UT >::spec, traits_t< UT >::spec );
1719  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1720  __kmp_str_free( &buff );
1721  }
1722  #endif
1723  } // if
1724  } // if
1725  break;
1726  } // case
1727  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1728  case kmp_sch_static_balanced:
1729  {
1730  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1731  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1732  pr->u.p.count = 1;
1733  *p_lb = pr->u.p.lb;
1734  *p_ub = pr->u.p.ub;
1735  last = pr->u.p.parm1;
1736  if ( p_st != NULL )
1737  *p_st = pr->u.p.st;
1738  } else { /* no iterations to do */
1739  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1740  }
1741  if ( pr->ordered ) {
1742  #ifdef KMP_DEBUG
1743  {
1744  const char * buff;
1745  // create format specifiers before the debug output
1746  buff = __kmp_str_format(
1747  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1748  traits_t< UT >::spec, traits_t< UT >::spec );
1749  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1750  __kmp_str_free( &buff );
1751  }
1752  #endif
1753  } // if
1754  } // case
1755  break;
1756  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1757  case kmp_sch_static_chunked:
1758  {
1759  T parm1;
1760 
1761  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1762  gtid ) );
1763  parm1 = pr->u.p.parm1;
1764 
1765  trip = pr->u.p.tc - 1;
1766  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
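 // Chunks are assigned round-robin by thread id: with nproc=4 threads and
 // parm1=10 (illustrative numbers), thread 2 first computes chunk index
 // count + tid = 2, i.e. iterations 20..29; after count += nproc it next
 // gets chunk index 6, i.e. iterations 60..69, and so on until init > trip.
 // (For kmp_sch_static_greedy the same code runs with parm1 covering a
 // thread's whole share, so the second probe already falls past trip.)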
1767 
1768  if ( (status = (init <= trip)) != 0 ) {
1769  start = pr->u.p.lb;
1770  incr = pr->u.p.st;
1771  limit = parm1 + init - 1;
1772 
1773  if ( (last = (limit >= trip)) != 0 )
1774  limit = trip;
1775 
1776  if ( p_st != NULL ) *p_st = incr;
1777 
1778  pr->u.p.count += team->t.t_nproc;
1779 
1780  if ( incr == 1 ) {
1781  *p_lb = start + init;
1782  *p_ub = start + limit;
1783  }
1784  else {
1785  *p_lb = start + init * incr;
1786  *p_ub = start + limit * incr;
1787  }
1788 
1789  if ( pr->ordered ) {
1790  pr->u.p.ordered_lower = init;
1791  pr->u.p.ordered_upper = limit;
1792  #ifdef KMP_DEBUG
1793  {
1794  const char * buff;
1795  // create format specifiers before the debug output
1796  buff = __kmp_str_format(
1797  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1798  traits_t< UT >::spec, traits_t< UT >::spec );
1799  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1800  __kmp_str_free( &buff );
1801  }
1802  #endif
1803  } // if
1804  } // if
1805  } // case
1806  break;
1807 
1808  case kmp_sch_dynamic_chunked:
1809  {
1810  T chunk = pr->u.p.parm1;
1811 
1812  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1813  gtid ) );
1814 
1815  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1816  trip = pr->u.p.tc - 1;
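 // Each call claims the next ticket from the shared counter sh->u.s.iteration
 // and turns it into bounds: init = chunk * ticket, limit = init + chunk - 1,
 // clipped to trip. Illustrative numbers: with chunk=4 and tc=10 (trip=9),
 // tickets 0, 1 and 2 yield iterations 0..3, 4..7 and 8..9, while ticket 3
 // gives init=12 > trip, i.e. no more work for that caller.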
1817 
1818  if ( (status = (init <= trip)) == 0 ) {
1819  *p_lb = 0;
1820  *p_ub = 0;
1821  if ( p_st != NULL ) *p_st = 0;
1822  } else {
1823  start = pr->u.p.lb;
1824  limit = chunk + init - 1;
1825  incr = pr->u.p.st;
1826 
1827  if ( (last = (limit >= trip)) != 0 )
1828  limit = trip;
1829 
1830  if ( p_st != NULL ) *p_st = incr;
1831 
1832  if ( incr == 1 ) {
1833  *p_lb = start + init;
1834  *p_ub = start + limit;
1835  } else {
1836  *p_lb = start + init * incr;
1837  *p_ub = start + limit * incr;
1838  }
1839 
1840  if ( pr->ordered ) {
1841  pr->u.p.ordered_lower = init;
1842  pr->u.p.ordered_upper = limit;
1843  #ifdef KMP_DEBUG
1844  {
1845  const char * buff;
1846  // create format specifiers before the debug output
1847  buff = __kmp_str_format(
1848  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1849  traits_t< UT >::spec, traits_t< UT >::spec );
1850  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1851  __kmp_str_free( &buff );
1852  }
1853  #endif
1854  } // if
1855  } // if
1856  } // case
1857  break;
1858 
1859  case kmp_sch_guided_iterative_chunked:
1860  {
1861  T chunkspec = pr->u.p.parm1;
1862  KD_TRACE(100,
1863  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1864  trip = pr->u.p.tc;
1865  // Start atomic part of calculations
1866  while(1) {
1867  ST remaining; // signed, because can be < 0
1868  init = sh->u.s.iteration; // shared value
1869  remaining = trip - init;
1870  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1871  // nothing to do, don't try atomic op
1872  status = 0;
1873  break;
1874  }
1875  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1876  // use dynamic-style schedule
1877  // atomically increment iterations, get old value
1878  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1879  remaining = trip - init;
1880  if (remaining <= 0) {
1881  status = 0; // all iterations got by other threads
1882  } else {
1883  // got some iterations to work on
1884  status = 1;
1885  if ( (T)remaining > chunkspec ) {
1886  limit = init + chunkspec - 1;
1887  } else {
1888  last = 1; // the last chunk
1889  limit = init + remaining - 1;
1890  } // if
1891  } // if
1892  break;
1893  } // if
1894  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1895  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1896  // CAS was successful, chunk obtained
1897  status = 1;
1898  --limit;
1899  break;
1900  } // if
1901  } // while
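 // Sketch of the guided step above, assuming the defaults noted in the
 // comments (K=2, so parm3 is roughly 1/(2*nproc)): with nproc=4 and
 // trip=1000, the first successful CAS advances sh->u.s.iteration by about
 // 1000/8 = 125 iterations, the next by about 875/8 = 109, and so on, the
 // grabs shrinking geometrically until fewer than parm2 iterations remain,
 // at which point the plain chunkspec-sized fetch-and-add path takes over.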
1902  if ( status != 0 ) {
1903  start = pr->u.p.lb;
1904  incr = pr->u.p.st;
1905  if ( p_st != NULL )
1906  *p_st = incr;
1907  *p_lb = start + init * incr;
1908  *p_ub = start + limit * incr;
1909  if ( pr->ordered ) {
1910  pr->u.p.ordered_lower = init;
1911  pr->u.p.ordered_upper = limit;
1912  #ifdef KMP_DEBUG
1913  {
1914  const char * buff;
1915  // create format specifiers before the debug output
1916  buff = __kmp_str_format(
1917  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1918  traits_t< UT >::spec, traits_t< UT >::spec );
1919  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1920  __kmp_str_free( &buff );
1921  }
1922  #endif
1923  } // if
1924  } else {
1925  *p_lb = 0;
1926  *p_ub = 0;
1927  if ( p_st != NULL )
1928  *p_st = 0;
1929  } // if
1930  } // case
1931  break;
1932 
1933  case kmp_sch_guided_analytical_chunked:
1934  {
1935  T chunkspec = pr->u.p.parm1;
1936  UT chunkIdx;
1937  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1938  /* for storing the original FPCW value for Windows* OS on
1939  IA-32 architecture (8-byte index version) */
1940  unsigned int oldFpcw;
1941  unsigned int fpcwSet = 0;
1942  #endif
1943  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1944  gtid ) );
1945 
1946  trip = pr->u.p.tc;
1947 
1948  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1949  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1950 
1951  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1952  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1953  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1954  --trip;
1955  /* use dynamic-style scheduling */
1956  init = chunkIdx * chunkspec + pr->u.p.count;
1957  /* need to verify init > 0 in case of overflow in the above calculation */
1958  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1959  limit = init + chunkspec -1;
1960 
1961  if ( (last = (limit >= trip)) != 0 )
1962  limit = trip;
1963  }
1964  break;
1965  } else {
1966  /* use exponential-style scheduling */
1967  /* The following check works around the lack of long double precision on Windows* OS,
1968  which could otherwise cause init != 0 for chunkIdx == 0.
1969  */
1970  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1971  /* If we haven't already done so, save original
1972  FPCW and set precision to 64-bit, as Windows* OS
1973  on IA-32 architecture defaults to 53-bit */
1974  if ( !fpcwSet ) {
1975  oldFpcw = _control87(0,0);
1976  _control87(_PC_64,_MCW_PC);
1977  fpcwSet = 0x30000;
1978  }
1979  #endif
1980  if ( chunkIdx ) {
1981  init = __kmp_dispatch_guided_remaining< T >(
1982  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1983  KMP_DEBUG_ASSERT(init);
1984  init = trip - init;
1985  } else
1986  init = 0;
1987  limit = trip - __kmp_dispatch_guided_remaining< T >(
1988  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1989  KMP_ASSERT(init <= limit);
1990  if ( init < limit ) {
1991  KMP_DEBUG_ASSERT(limit <= trip);
1992  --limit;
1993  status = 1;
1994  break;
1995  } // if
1996  } // if
1997  } // while (1)
1998  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1999  /* restore FPCW if necessary
2000  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2001  */
2002  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2003  _control87(oldFpcw,_MCW_PC);
2004  #endif
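 // In the exponential branch above, __kmp_dispatch_guided_remaining(trip,
 // base, k) gives the number of iterations still left after the first k
 // chunks (with the per-chunk shrink factor precomputed in parm3), so chunk
 // k covers [trip - remaining(k), trip - remaining(k+1)). The FPCW handling
 // is only needed because that computation relies on long double precision,
 // which Windows* OS on IA-32 architecture truncates to 53 bits by default.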
2005  if ( status != 0 ) {
2006  start = pr->u.p.lb;
2007  incr = pr->u.p.st;
2008  if ( p_st != NULL )
2009  *p_st = incr;
2010  *p_lb = start + init * incr;
2011  *p_ub = start + limit * incr;
2012  if ( pr->ordered ) {
2013  pr->u.p.ordered_lower = init;
2014  pr->u.p.ordered_upper = limit;
2015  #ifdef KMP_DEBUG
2016  {
2017  const char * buff;
2018  // create format specifiers before the debug output
2019  buff = __kmp_str_format(
2020  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2021  traits_t< UT >::spec, traits_t< UT >::spec );
2022  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2023  __kmp_str_free( &buff );
2024  }
2025  #endif
2026  }
2027  } else {
2028  *p_lb = 0;
2029  *p_ub = 0;
2030  if ( p_st != NULL )
2031  *p_st = 0;
2032  }
2033  } // case
2034  break;
2035 
2036  case kmp_sch_trapezoidal:
2037  {
2038  UT index;
2039  T parm2 = pr->u.p.parm2;
2040  T parm3 = pr->u.p.parm3;
2041  T parm4 = pr->u.p.parm4;
2042  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2043  gtid ) );
2044 
2045  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2046 
2047  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2048  trip = pr->u.p.tc - 1;
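 // The expression for init is the sum of the first 'index' chunk sizes,
 // which shrink linearly by parm4 starting from parm2:
 //   index*parm2 - parm4*(0+1+...+(index-1)) = index*(2*parm2 - (index-1)*parm4)/2
 // Illustrative numbers: with parm2=10 and parm4=2 the chunks are 10, 8, 6, ...
 // so chunk 3 starts at 10+8+6 = 24 = 3*(2*10 - 2*2)/2.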
2049 
2050  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2051  *p_lb = 0;
2052  *p_ub = 0;
2053  if ( p_st != NULL ) *p_st = 0;
2054  } else {
2055  start = pr->u.p.lb;
2056  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2057  incr = pr->u.p.st;
2058 
2059  if ( (last = (limit >= trip)) != 0 )
2060  limit = trip;
2061 
2062  if ( p_st != NULL ) *p_st = incr;
2063 
2064  if ( incr == 1 ) {
2065  *p_lb = start + init;
2066  *p_ub = start + limit;
2067  } else {
2068  *p_lb = start + init * incr;
2069  *p_ub = start + limit * incr;
2070  }
2071 
2072  if ( pr->ordered ) {
2073  pr->u.p.ordered_lower = init;
2074  pr->u.p.ordered_upper = limit;
2075  #ifdef KMP_DEBUG
2076  {
2077  const char * buff;
2078  // create format specifiers before the debug output
2079  buff = __kmp_str_format(
2080  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2081  traits_t< UT >::spec, traits_t< UT >::spec );
2082  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2083  __kmp_str_free( &buff );
2084  }
2085  #endif
2086  } // if
2087  } // if
2088  } // case
2089  break;
2090  default:
2091  {
2092  status = 0; // to avoid complaints on uninitialized variable use
2093  __kmp_msg(
2094  kmp_ms_fatal, // Severity
2095  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2096  KMP_HNT( GetNewerLibrary ), // Hint
2097  __kmp_msg_null // Variadic argument list terminator
2098  );
2099  }
2100  break;
2101  } // switch
2102  } // if tc == 0;
2103 
2104  if ( status == 0 ) {
2105  UT num_done;
2106 
2107  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2108  #ifdef KMP_DEBUG
2109  {
2110  const char * buff;
2111  // create format specifiers before the debug output
2112  buff = __kmp_str_format(
2113  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2114  traits_t< UT >::spec );
2115  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2116  __kmp_str_free( &buff );
2117  }
2118  #endif
2119 
2120  if ( (ST)num_done == team->t.t_nproc-1 ) {
2121  /* NOTE: release this buffer to be reused */
2122 
2123  KMP_MB(); /* Flush all pending memory write invalidates. */
2124 
2125  sh->u.s.num_done = 0;
2126  sh->u.s.iteration = 0;
2127 
2128  /* TODO replace with general release procedure? */
2129  if ( pr->ordered ) {
2130  sh->u.s.ordered_iteration = 0;
2131  }
2132 
2133  KMP_MB(); /* Flush all pending memory write invalidates. */
2134 
2135  sh -> buffer_index += KMP_MAX_DISP_BUF;
2136  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2137  gtid, sh->buffer_index) );
2138 
2139  KMP_MB(); /* Flush all pending memory write invalidates. */
2140 
2141  } // if
2142  if ( __kmp_env_consistency_check ) {
2143  if ( pr->pushed_ws != ct_none ) {
2144  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2145  }
2146  }
2147 
2148  th -> th.th_dispatch -> th_deo_fcn = NULL;
2149  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2150  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2151  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2152  } // if (status == 0)
2153 #if KMP_OS_WINDOWS
2154  else if ( last ) {
2155  pr->u.p.last_upper = pr->u.p.ub;
2156  }
2157 #endif /* KMP_OS_WINDOWS */
2158  if ( p_last != NULL && status != 0 )
2159  *p_last = last;
2160  } // if
2161 
2162  #ifdef KMP_DEBUG
2163  {
2164  const char * buff;
2165  // create format specifiers before the debug output
2166  buff = __kmp_str_format(
2167  "__kmp_dispatch_next: T#%%d normal case: " \
2168  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2169  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2170  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2171  __kmp_str_free( &buff );
2172  }
2173  #endif
2174 #if INCLUDE_SSC_MARKS
2175  SSC_MARK_DISPATCH_NEXT();
2176 #endif
2177  OMPT_LOOP_END;
2178  return status;
2179 }
2180 
2181 template< typename T >
2182 static void
2183 __kmp_dist_get_bounds(
2184  ident_t *loc,
2185  kmp_int32 gtid,
2186  kmp_int32 *plastiter,
2187  T *plower,
2188  T *pupper,
2189  typename traits_t< T >::signed_t incr
2190 ) {
2191  typedef typename traits_t< T >::unsigned_t UT;
2192  typedef typename traits_t< T >::signed_t ST;
2193  register kmp_uint32 team_id;
2194  register kmp_uint32 nteams;
2195  register UT trip_count;
2196  register kmp_team_t *team;
2197  kmp_info_t * th;
2198 
2199  KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2200  KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2201  #ifdef KMP_DEBUG
2202  {
2203  const char * buff;
2204  // create format specifiers before the debug output
2205  buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2206  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2207  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2208  traits_t< T >::spec );
2209  KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2210  __kmp_str_free( &buff );
2211  }
2212  #endif
2213 
2214  if( __kmp_env_consistency_check ) {
2215  if( incr == 0 ) {
2216  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2217  }
2218  if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2219  // The loop is illegal.
2220  // Some zero-trip loops are handled by the compiler, e.g.:
2221  // for(i=10;i<0;++i) // lower >= upper - run-time check
2222  // for(i=0;i>10;--i) // lower <= upper - run-time check
2223  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2224  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2225  // Compiler does not check the following illegal loops:
2226  // for(i=0;i<10;i+=incr) // where incr<0
2227  // for(i=10;i>0;i-=incr) // where incr<0
2228  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2229  }
2230  }
2231  th = __kmp_threads[gtid];
2232  team = th->th.th_team;
2233  #if OMP_40_ENABLED
2234  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2235  nteams = th->th.th_teams_size.nteams;
2236  #endif
2237  team_id = team->t.t_master_tid;
2238  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2239 
2240  // compute global trip count
2241  if( incr == 1 ) {
2242  trip_count = *pupper - *plower + 1;
2243  } else if(incr == -1) {
2244  trip_count = *plower - *pupper + 1;
2245  } else {
2246  trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2247  }
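 // Illustrative numbers: lower=0, upper=10, incr=3 gives 10/3 + 1 = 4
 // iterations (0, 3, 6, 9); the signed cast makes the same formula work for
 // incr<0, e.g. lower=10, upper=0, incr=-3 gives (-10)/(-3) + 1 = 4.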
2248 
2249  if( trip_count <= nteams ) {
2250  KMP_DEBUG_ASSERT(
2251  __kmp_static == kmp_sch_static_greedy || \
2252  __kmp_static == kmp_sch_static_balanced
2253  ); // Unknown static scheduling type.
2254  // only some teams get single iteration, others get nothing
2255  if( team_id < trip_count ) {
2256  *pupper = *plower = *plower + team_id * incr;
2257  } else {
2258  *plower = *pupper + incr; // zero-trip loop
2259  }
2260  if( plastiter != NULL )
2261  *plastiter = ( team_id == trip_count - 1 );
2262  } else {
2263  if( __kmp_static == kmp_sch_static_balanced ) {
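 // Illustrative split: trip_count=10 over nteams=4 gives chunk=2, extras=2;
 // teams 0 and 1 get 3 iterations each, teams 2 and 3 get 2, with lower
 // bounds offset by 0, 3, 6 and 8 increments respectively.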
2264  register UT chunk = trip_count / nteams;
2265  register UT extras = trip_count % nteams;
2266  *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2267  *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2268  if( plastiter != NULL )
2269  *plastiter = ( team_id == nteams - 1 );
2270  } else {
2271  register T chunk_inc_count =
2272  ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2273  register T upper = *pupper;
2274  KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2275  // Unknown static scheduling type.
2276  *plower += team_id * chunk_inc_count;
2277  *pupper = *plower + chunk_inc_count - incr;
2278  // Check/correct bounds if needed
2279  if( incr > 0 ) {
2280  if( *pupper < *plower )
2281  *pupper = i_maxmin< T >::mx;
2282  if( plastiter != NULL )
2283  *plastiter = *plower <= upper && *pupper > upper - incr;
2284  if( *pupper > upper )
2285  *pupper = upper; // tracker C73258
2286  } else {
2287  if( *pupper > *plower )
2288  *pupper = i_maxmin< T >::mn;
2289  if( plastiter != NULL )
2290  *plastiter = *plower >= upper && *pupper < upper - incr;
2291  if( *pupper < upper )
2292  *pupper = upper; // tracker C73258
2293  }
2294  }
2295  }
2296 }
2297 
2298 //-----------------------------------------------------------------------------------------
2299 // Dispatch routines
2300 // Transfer call to template< type T >
2301 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2302 // T lb, T ub, ST st, ST chunk )
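// For orientation, a hedged sketch of how a compiler typically drives these
// entry points for a loop such as
//     #pragma omp for schedule(dynamic, 4)
//     for ( int i = lo; i < hi; ++i ) body( i );
// (illustrative lowering only; the actual code generation is up to the
//  compiler, and 'loc', 'gtid', 'lo', 'hi' and 'body' are placeholders):
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked,
//                           lo, hi - 1, 1, 4 );
//   while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
//       for ( kmp_int32 i = lb; i <= ub; i += st )
//           body( i );      // 'last' is nonzero for the chunk holding the final iteration
//   }
//   // __kmpc_dispatch_fini_4() is only required for ordered loops.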
2303 extern "C" {
2304 
2320 void
2321 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2322  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2323 {
2324  KMP_DEBUG_ASSERT( __kmp_init_serial );
2325  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2326 }
2330 void
2331 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2332  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2333 {
2334  KMP_DEBUG_ASSERT( __kmp_init_serial );
2335  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2336 }
2337 
2341 void
2342 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2343  kmp_int64 lb, kmp_int64 ub,
2344  kmp_int64 st, kmp_int64 chunk )
2345 {
2346  KMP_DEBUG_ASSERT( __kmp_init_serial );
2347  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2348 }
2349 
2353 void
2354 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2355  kmp_uint64 lb, kmp_uint64 ub,
2356  kmp_int64 st, kmp_int64 chunk )
2357 {
2358  KMP_DEBUG_ASSERT( __kmp_init_serial );
2359  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2360 }
2361 
2371 void
2372 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2373  kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2374 {
2375  KMP_DEBUG_ASSERT( __kmp_init_serial );
2376  __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2377  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2378 }
2379 
2380 void
2381 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2382  kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2383 {
2384  KMP_DEBUG_ASSERT( __kmp_init_serial );
2385  __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2386  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2387 }
2388 
2389 void
2390 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2391  kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2392 {
2393  KMP_DEBUG_ASSERT( __kmp_init_serial );
2394  __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2395  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2396 }
2397 
2398 void
2399 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2400  kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2401 {
2402  KMP_DEBUG_ASSERT( __kmp_init_serial );
2403  __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2404  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2405 }
2406 
2419 int
2420 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2421  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2422 {
2423  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2424 }
2425 
2429 int
2430 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2431  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2432 {
2433  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2434 }
2435 
2439 int
2440 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2441  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2442 {
2443  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2444 }
2445 
2449 int
2450 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2451  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2452 {
2453  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2454 }
2455 
2462 void
2463 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2464 {
2465  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2466 }
2467 
2471 void
2472 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2473 {
2474  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2475 }
2476 
2480 void
2481 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2482 {
2483  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2484 }
2485 
2489 void
2490 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2491 {
2492  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2493 }
2496 //-----------------------------------------------------------------------------------------
2497 // Non-template routines from kmp_dispatch.cpp used in other sources
2498 
2499 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2500  return value == checker;
2501 }
2502 
2503 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2504  return value != checker;
2505 }
2506 
2507 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2508  return value < checker;
2509 }
2510 
2511 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2512  return value >= checker;
2513 }
2514 
2515 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2516  return value <= checker;
2517 }
2518 
2519 kmp_uint32
2520 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2521  kmp_uint32 checker,
2522  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2523  , void * obj // Higher-level synchronization object, or NULL.
2524  )
2525 {
2526  // note: we may not belong to a team at this point
2527  register volatile kmp_uint32 * spin = spinner;
2528  register kmp_uint32 check = checker;
2529  register kmp_uint32 spins;
2530  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2531  register kmp_uint32 r;
2532 
2533  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2534  KMP_INIT_YIELD( spins );
2535  // main wait spin loop
2536  while(!f(r = TCR_4(*spin), check)) {
2537  KMP_FSYNC_SPIN_PREPARE( obj );
2538  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2539  It causes problems with infinite recursion because of exit lock */
2540  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2541  __kmp_abort_thread(); */
2542 
2543  /* if we have waited a bit, or are oversubscribed, yield */
2544  /* pause is in the following code */
2545  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2546  KMP_YIELD_SPIN( spins );
2547  }
2548  KMP_FSYNC_SPIN_ACQUIRED( obj );
2549  return r;
2550 }
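// A usage sketch (illustrative only, not a call site in this file): to spin
// until a shared flag becomes 1, pair the helper with one of the predicates
// above:
//
//   volatile kmp_uint32 flag = 0;   // set to 1 by another thread
//   __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
//
// The loop pauses/yields while waiting and returns the observed value of the
// spinner once the predicate holds.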
2551 
2552 } // extern "C"
2553 
2554 #ifdef KMP_GOMP_COMPAT
2555 
2556 void
2557 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2558  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2559  kmp_int32 chunk, int push_ws )
2560 {
2561  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2562  push_ws );
2563 }
2564 
2565 void
2566 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2567  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2568  kmp_int32 chunk, int push_ws )
2569 {
2570  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2571  push_ws );
2572 }
2573 
2574 void
2575 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2576  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2577  kmp_int64 chunk, int push_ws )
2578 {
2579  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2580  push_ws );
2581 }
2582 
2583 void
2584 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2585  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2586  kmp_int64 chunk, int push_ws )
2587 {
2588  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2589  push_ws );
2590 }
2591 
2592 void
2593 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2594 {
2595  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2596 }
2597 
2598 void
2599 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2600 {
2601  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2602 }
2603 
2604 void
2605 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2606 {
2607  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2608 }
2609 
2610 void
2611 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2612 {
2613  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2614 }
2615 
2616 #endif /* KMP_GOMP_COMPAT */
2617 
2618 /* ------------------------------------------------------------------------ */
2619 /* ------------------------------------------------------------------------ */
2620 