LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20  * it may change values between parallel regions. __kmp_max_nth
21  * is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 // Need to raise Win version from XP to Vista here for support of InterlockedExchange64
29 #if defined(_WIN32_WINNT) && defined(_M_IX86)
30 #undef _WIN32_WINNT
31 #define _WIN32_WINNT 0x0502
32 #endif
33 
34 #include "kmp.h"
35 #include "kmp_i18n.h"
36 #include "kmp_itt.h"
37 #include "kmp_str.h"
38 #include "kmp_error.h"
39 #include "kmp_stats.h"
40 #if KMP_OS_WINDOWS && KMP_ARCH_X86
41  #include <float.h>
42 #endif
43 
44 #if OMPT_SUPPORT
45 #include "ompt-internal.h"
46 #include "ompt-specific.h"
47 #endif
48 
49 /* ------------------------------------------------------------------------ */
50 /* ------------------------------------------------------------------------ */
51 
52 // template for type limits
53 template< typename T >
54 struct i_maxmin {
55  static const T mx;
56  static const T mn;
57 };
58 template<>
59 struct i_maxmin< int > {
60  static const int mx = 0x7fffffff;
61  static const int mn = 0x80000000;
62 };
63 template<>
64 struct i_maxmin< unsigned int > {
65  static const unsigned int mx = 0xffffffff;
66  static const unsigned int mn = 0x00000000;
67 };
68 template<>
69 struct i_maxmin< long long > {
70  static const long long mx = 0x7fffffffffffffffLL;
71  static const long long mn = 0x8000000000000000LL;
72 };
73 template<>
74 struct i_maxmin< unsigned long long > {
75  static const unsigned long long mx = 0xffffffffffffffffLL;
76  static const unsigned long long mn = 0x0000000000000000LL;
77 };
78 //-------------------------------------------------------------------------
79 
80 #if KMP_STATIC_STEAL_ENABLED
81 
82  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
83  template< typename T >
84  struct dispatch_private_infoXX_template {
85  typedef typename traits_t< T >::unsigned_t UT;
86  typedef typename traits_t< T >::signed_t ST;
87  UT count; // unsigned
88  T ub;
89  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
90  T lb;
91  ST st; // signed
92  UT tc; // unsigned
93  T static_steal_counter; // for static_steal only; maybe better to put after ub
94 
95  /* parm[1-4] are used in different ways by different scheduling algorithms */
96 
97  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
98  // a) parm3 is properly aligned and
99  // b) all parm1-4 are in the same cache line.
 100  // Because parm1-4 are used together, performance seems to be better
 101  // if they are in the same cache line (not measured though).
102 
103  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
104  T parm1;
105  T parm2;
106  T parm3;
107  T parm4;
108  };
109 
110  UT ordered_lower; // unsigned
111  UT ordered_upper; // unsigned
112  #if KMP_OS_WINDOWS
113  T last_upper;
114  #endif /* KMP_OS_WINDOWS */
115  };
116 
117 #else /* KMP_STATIC_STEAL_ENABLED */
118 
119  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
120  template< typename T >
121  struct dispatch_private_infoXX_template {
122  typedef typename traits_t< T >::unsigned_t UT;
123  typedef typename traits_t< T >::signed_t ST;
124  T lb;
125  T ub;
126  ST st; // signed
127  UT tc; // unsigned
128 
129  T parm1;
130  T parm2;
131  T parm3;
132  T parm4;
133 
134  UT count; // unsigned
135 
136  UT ordered_lower; // unsigned
137  UT ordered_upper; // unsigned
138  #if KMP_OS_WINDOWS
139  T last_upper;
140  #endif /* KMP_OS_WINDOWS */
141  };
142 
143 #endif /* KMP_STATIC_STEAL_ENABLED */
144 
145 // replaces dispatch_private_info structure and dispatch_private_info_t type
146 template< typename T >
147 struct KMP_ALIGN_CACHE dispatch_private_info_template {
 148  // duplicate the alignment here, otherwise the size of the structure is not correct in our compiler
149  union KMP_ALIGN_CACHE private_info_tmpl {
150  dispatch_private_infoXX_template< T > p;
151  dispatch_private_info64_t p64;
152  } u;
153  enum sched_type schedule; /* scheduling algorithm */
154  kmp_uint32 ordered; /* ordered clause specified */
155  kmp_uint32 ordered_bumped;
 156  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
157  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
158  kmp_uint32 nomerge; /* don't merge iters if serialized */
159  kmp_uint32 type_size;
160  enum cons_type pushed_ws;
161 };
162 
163 
164 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
165 template< typename UT >
166 struct dispatch_shared_infoXX_template {
167  /* chunk index under dynamic, number of idle threads under static-steal;
168  iteration index otherwise */
169  volatile UT iteration;
170  volatile UT num_done;
171  volatile UT ordered_iteration;
172  UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
173 };
174 
175 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
176 template< typename UT >
177 struct dispatch_shared_info_template {
178  // we need union here to keep the structure size
179  union shared_info_tmpl {
180  dispatch_shared_infoXX_template< UT > s;
181  dispatch_shared_info64_t s64;
182  } u;
183  volatile kmp_uint32 buffer_index;
184 #if OMP_45_ENABLED
185  volatile kmp_int32 doacross_buf_idx; // teamwise index
186  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
187  kmp_int32 doacross_num_done; // count finished threads
188 #endif
189 #if KMP_USE_HWLOC
 190  // When linking with libhwloc, the ORDERED EPCC test slows down on big
191  // machines (> 48 cores). Performance analysis showed that a cache thrash
192  // was occurring and this padding helps alleviate the problem.
193  char padding[64];
194 #endif
195 };
196 
197 /* ------------------------------------------------------------------------ */
198 /* ------------------------------------------------------------------------ */
199 
200 #undef USE_TEST_LOCKS
201 
202 // test_then_add template (general template should NOT be used)
203 template< typename T >
204 static __forceinline T
205 test_then_add( volatile T *p, T d );
206 
207 template<>
208 __forceinline kmp_int32
209 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
210 {
211  kmp_int32 r;
212  r = KMP_TEST_THEN_ADD32( p, d );
213  return r;
214 }
215 
216 template<>
217 __forceinline kmp_int64
218 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
219 {
220  kmp_int64 r;
221  r = KMP_TEST_THEN_ADD64( p, d );
222  return r;
223 }
224 
225 // test_then_inc_acq template (general template should NOT be used)
226 template< typename T >
227 static __forceinline T
228 test_then_inc_acq( volatile T *p );
229 
230 template<>
231 __forceinline kmp_int32
232 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
233 {
234  kmp_int32 r;
235  r = KMP_TEST_THEN_INC_ACQ32( p );
236  return r;
237 }
238 
239 template<>
240 __forceinline kmp_int64
241 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
242 {
243  kmp_int64 r;
244  r = KMP_TEST_THEN_INC_ACQ64( p );
245  return r;
246 }
247 
248 // test_then_inc template (general template should NOT be used)
249 template< typename T >
250 static __forceinline T
251 test_then_inc( volatile T *p );
252 
253 template<>
254 __forceinline kmp_int32
255 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
256 {
257  kmp_int32 r;
258  r = KMP_TEST_THEN_INC32( p );
259  return r;
260 }
261 
262 template<>
263 __forceinline kmp_int64
264 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
265 {
266  kmp_int64 r;
267  r = KMP_TEST_THEN_INC64( p );
268  return r;
269 }
270 
271 // compare_and_swap template (general template should NOT be used)
272 template< typename T >
273 static __forceinline kmp_int32
274 compare_and_swap( volatile T *p, T c, T s );
275 
276 template<>
277 __forceinline kmp_int32
278 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
279 {
280  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
281 }
282 
283 template<>
284 __forceinline kmp_int32
285 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
286 {
287  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
288 }
289 
290 /*
291  Spin wait loop that first does pause, then yield.
292  Waits until function returns non-zero when called with *spinner and check.
293  Does NOT put threads to sleep.
294 #if USE_ITT_BUILD
295  Arguments:
 296  obj -- higher-level synchronization object to report to ittnotify. It is used to report
 297  locks consistently. For example, if a lock is acquired immediately, its address is
 298  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
 299  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
 300  address, not the address of the low-level spinner.
301 #endif // USE_ITT_BUILD
302 */
303 template< typename UT >
304 // ToDo: make inline function (move to header file for icl)
305 static UT // unsigned 4- or 8-byte type
306 __kmp_wait_yield( volatile UT * spinner,
307  UT checker,
308  kmp_uint32 (* pred)( UT, UT )
309  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
310  )
311 {
312  // note: we may not belong to a team at this point
313  register volatile UT * spin = spinner;
314  register UT check = checker;
315  register kmp_uint32 spins;
316  register kmp_uint32 (*f) ( UT, UT ) = pred;
317  register UT r;
318 
319  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
320  KMP_INIT_YIELD( spins );
321  // main wait spin loop
322  while(!f(r = *spin, check))
323  {
324  KMP_FSYNC_SPIN_PREPARE( obj );
325  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
326  It causes problems with infinite recursion because of exit lock */
327  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
328  __kmp_abort_thread(); */
329 
330  // if we are oversubscribed,
 331  // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
 332  // the pause is in the following code
333  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
334  KMP_YIELD_SPIN( spins );
335  }
336  KMP_FSYNC_SPIN_ACQUIRED( obj );
337  return r;
338 }
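  // Usage sketch (illustrative; mirrors the calls in __kmp_dispatch_deo/__kmp_dispatch_finish below):
  //   __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
  //                           USE_ITT_BUILD_ARG( NULL ) );
  // spins (pause, then yield) until the predicate __kmp_ge returns non-zero, i.e. until
  // sh->u.s.ordered_iteration >= lower.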
339 
340 template< typename UT >
341 static kmp_uint32 __kmp_eq( UT value, UT checker) {
342  return value == checker;
343 }
344 
345 template< typename UT >
346 static kmp_uint32 __kmp_neq( UT value, UT checker) {
347  return value != checker;
348 }
349 
350 template< typename UT >
351 static kmp_uint32 __kmp_lt( UT value, UT checker) {
352  return value < checker;
353 }
354 
355 template< typename UT >
356 static kmp_uint32 __kmp_ge( UT value, UT checker) {
357  return value >= checker;
358 }
359 
360 template< typename UT >
361 static kmp_uint32 __kmp_le( UT value, UT checker) {
362  return value <= checker;
363 }
364 
365 
366 /* ------------------------------------------------------------------------ */
367 /* ------------------------------------------------------------------------ */
368 
369 static void
370 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
371 {
372  kmp_info_t *th;
373 
374  KMP_DEBUG_ASSERT( gtid_ref );
375 
376  if ( __kmp_env_consistency_check ) {
377  th = __kmp_threads[*gtid_ref];
378  if ( th -> th.th_root -> r.r_active
379  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
380 #if KMP_USE_DYNAMIC_LOCK
381  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
382 #else
383  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
384 #endif
385  }
386  }
387 }
388 
389 template< typename UT >
390 static void
391 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
392 {
393  typedef typename traits_t< UT >::signed_t ST;
394  dispatch_private_info_template< UT > * pr;
395 
396  int gtid = *gtid_ref;
397 // int cid = *cid_ref;
398  kmp_info_t *th = __kmp_threads[ gtid ];
399  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
400 
401  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
402  if ( __kmp_env_consistency_check ) {
403  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404  ( th -> th.th_dispatch -> th_dispatch_pr_current );
405  if ( pr -> pushed_ws != ct_none ) {
406 #if KMP_USE_DYNAMIC_LOCK
407  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
408 #else
409  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
410 #endif
411  }
412  }
413 
414  if ( ! th -> th.th_team -> t.t_serialized ) {
415  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
416  ( th -> th.th_dispatch -> th_dispatch_sh_current );
417  UT lower;
418 
419  if ( ! __kmp_env_consistency_check ) {
420  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
421  ( th -> th.th_dispatch -> th_dispatch_pr_current );
422  }
423  lower = pr->u.p.ordered_lower;
424 
425  #if ! defined( KMP_GOMP_COMPAT )
426  if ( __kmp_env_consistency_check ) {
427  if ( pr->ordered_bumped ) {
428  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
429  __kmp_error_construct2(
430  kmp_i18n_msg_CnsMultipleNesting,
431  ct_ordered_in_pdo, loc_ref,
432  & p->stack_data[ p->w_top ]
433  );
434  }
435  }
436  #endif /* !defined(KMP_GOMP_COMPAT) */
437 
438  KMP_MB();
439  #ifdef KMP_DEBUG
440  {
441  const char * buff;
442  // create format specifiers before the debug output
443  buff = __kmp_str_format(
444  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
445  traits_t< UT >::spec, traits_t< UT >::spec );
446  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
447  __kmp_str_free( &buff );
448  }
449  #endif
450 
451  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
452  USE_ITT_BUILD_ARG( NULL )
453  );
454  KMP_MB(); /* is this necessary? */
455  #ifdef KMP_DEBUG
456  {
457  const char * buff;
458  // create format specifiers before the debug output
459  buff = __kmp_str_format(
460  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
461  traits_t< UT >::spec, traits_t< UT >::spec );
462  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
463  __kmp_str_free( &buff );
464  }
465  #endif
466  }
467  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
468 }
469 
470 static void
471 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
472 {
473  kmp_info_t *th;
474 
475  if ( __kmp_env_consistency_check ) {
476  th = __kmp_threads[*gtid_ref];
477  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
478  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
479  }
480  }
481 }
482 
483 template< typename UT >
484 static void
485 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
486 {
487  typedef typename traits_t< UT >::signed_t ST;
488  dispatch_private_info_template< UT > * pr;
489 
490  int gtid = *gtid_ref;
491 // int cid = *cid_ref;
492  kmp_info_t *th = __kmp_threads[ gtid ];
493  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
494 
495  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
496  if ( __kmp_env_consistency_check ) {
497  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
498  ( th -> th.th_dispatch -> th_dispatch_pr_current );
499  if ( pr -> pushed_ws != ct_none ) {
500  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
501  }
502  }
503 
504  if ( ! th -> th.th_team -> t.t_serialized ) {
505  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
506  ( th -> th.th_dispatch -> th_dispatch_sh_current );
507 
508  if ( ! __kmp_env_consistency_check ) {
509  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
510  ( th -> th.th_dispatch -> th_dispatch_pr_current );
511  }
512 
513  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
514  #if ! defined( KMP_GOMP_COMPAT )
515  if ( __kmp_env_consistency_check ) {
516  if ( pr->ordered_bumped != 0 ) {
517  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
518  /* How to test it? - OM */
519  __kmp_error_construct2(
520  kmp_i18n_msg_CnsMultipleNesting,
521  ct_ordered_in_pdo, loc_ref,
522  & p->stack_data[ p->w_top ]
523  );
524  }
525  }
526  #endif /* !defined(KMP_GOMP_COMPAT) */
527 
528  KMP_MB(); /* Flush all pending memory write invalidates. */
529 
530  pr->ordered_bumped += 1;
531 
532  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
533  gtid, pr->ordered_bumped ) );
534 
535  KMP_MB(); /* Flush all pending memory write invalidates. */
536 
537  /* TODO use general release procedure? */
538  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
539 
540  KMP_MB(); /* Flush all pending memory write invalidates. */
541  }
542  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
543 }
544 
545 /* Computes and returns x to the power of y, where y must be a non-negative integer */
546 template< typename UT >
547 static __forceinline long double
548 __kmp_pow(long double x, UT y) {
549  long double s=1.0L;
550 
551  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
552  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
553  while(y) {
554  if ( y & 1 )
555  s *= x;
556  x *= x;
557  y >>= 1;
558  }
559  return s;
560 }
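  // Worked example (illustrative): for x = 0.5 and y = 5 (binary 101) the loop above performs
  //   y=5: bit set   -> s = 0.5;                 x = 0.25
  //   y=2: bit clear ->                          x = 0.0625
  //   y=1: bit set   -> s = 0.5 * 0.0625 = 0.03125 = 0.5^5
  // i.e. binary exponentiation in O(log y) multiplications.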
561 
562 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
563  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
 564  __forceinline seems to be broken here: if we __forceinline this function, the behavior is wrong
565  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
566 */
567 template< typename T >
568 static __inline typename traits_t< T >::unsigned_t
569 __kmp_dispatch_guided_remaining(
570  T tc,
571  typename traits_t< T >::floating_t base,
572  typename traits_t< T >::unsigned_t idx
573 ) {
574  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
575  least for ICL 8.1, long double arithmetic may not really have
576  long double precision, even with /Qlong_double. Currently, we
577  workaround that in the caller code, by manipulating the FPCW for
578  Windows* OS on IA-32 architecture. The lack of precision is not
579  expected to be a correctness issue, though.
580  */
581  typedef typename traits_t< T >::unsigned_t UT;
582 
583  long double x = tc * __kmp_pow< UT >(base, idx);
584  UT r = (UT) x;
585  if ( x == r )
586  return r;
587  return r + 1;
588 }
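  // Worked example (illustrative, hypothetical values): with tc = 1000, base = 0.875
  // (i.e. (2*nproc - 1)/(2*nproc) for nproc = 4) and idx = 3:
  //   x = 1000 * 0.875^3 = 669.921875  ->  returns 670 (rounded up, since x != (UT)x),
  // i.e. about 670 iterations remain unassigned after the first 3 chunks.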
589 
590 // Parameters of the guided-iterative algorithm:
591 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
592 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
593 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
594 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip_count / nproc.
595 static int guided_int_param = 2;
596 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
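  // Worked example (illustrative, hypothetical values): with the default n = 2, nproc = 8 and
  // chunk = 7, a guided-iterative loop switches to dynamic-style chunking once fewer than
  // p2 = 2 * 8 * (7 + 1) = 128 iterations remain, and each chunk is roughly
  // p3 = 1 / (2 * 8) = 0.0625 of the remaining iterations (see parm2/parm3 in the
  // kmp_sch_guided_iterative_chunked case of __kmp_dispatch_init below).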
597 
598 // UT - unsigned flavor of T, ST - signed flavor of T,
599 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
600 template< typename T >
601 static void
602 __kmp_dispatch_init(
603  ident_t * loc,
604  int gtid,
605  enum sched_type schedule,
606  T lb,
607  T ub,
608  typename traits_t< T >::signed_t st,
609  typename traits_t< T >::signed_t chunk,
610  int push_ws
611 ) {
612  typedef typename traits_t< T >::unsigned_t UT;
613  typedef typename traits_t< T >::signed_t ST;
614  typedef typename traits_t< T >::floating_t DBL;
615  static const int ___kmp_size_type = sizeof( UT );
616 
617  int active;
618  T tc;
619  kmp_info_t * th;
620  kmp_team_t * team;
621  kmp_uint32 my_buffer_index;
622  dispatch_private_info_template< T > * pr;
623  dispatch_shared_info_template< UT > volatile * sh;
624 
625  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
626  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
627 
628  if ( ! TCR_4( __kmp_init_parallel ) )
629  __kmp_parallel_initialize();
630 
631 #if INCLUDE_SSC_MARKS
632  SSC_MARK_DISPATCH_INIT();
633 #endif
634  #ifdef KMP_DEBUG
635  {
636  const char * buff;
637  // create format specifiers before the debug output
638  buff = __kmp_str_format(
639  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
640  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
641  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
642  __kmp_str_free( &buff );
643  }
644  #endif
645  /* setup data */
646  th = __kmp_threads[ gtid ];
647  team = th -> th.th_team;
648  active = ! team -> t.t_serialized;
649  th->th.th_ident = loc;
650 
651 #if USE_ITT_BUILD
652  kmp_uint64 cur_chunk = chunk;
653  int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
654  KMP_MASTER_GTID(gtid) &&
655 #if OMP_40_ENABLED
656  th->th.th_teams_microtask == NULL &&
657 #endif
658  team->t.t_active_level == 1;
659 #endif
660  if ( ! active ) {
661  pr = reinterpret_cast< dispatch_private_info_template< T >* >
662  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
663  } else {
664  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
665  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
666 
667  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
668 
669  /* What happens when number of threads changes, need to resize buffer? */
670  pr = reinterpret_cast< dispatch_private_info_template< T > * >
671  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
672  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
673  ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
674  }
675 
676  #if ( KMP_STATIC_STEAL_ENABLED )
677  if ( SCHEDULE_HAS_NONMONOTONIC(schedule) )
678  // AC: we now have only one implementation of stealing, so use it
679  schedule = kmp_sch_static_steal;
680  else
681  #endif
682  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
683 
684  /* Pick up the nomerge/ordered bits from the scheduling type */
685  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
686  pr->nomerge = TRUE;
687  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
688  } else {
689  pr->nomerge = FALSE;
690  }
691  pr->type_size = ___kmp_size_type; // remember the size of variables
692  if ( kmp_ord_lower & schedule ) {
693  pr->ordered = TRUE;
694  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
695  } else {
696  pr->ordered = FALSE;
697  }
698 
699  if ( schedule == kmp_sch_static ) {
700  schedule = __kmp_static;
701  } else {
702  if ( schedule == kmp_sch_runtime ) {
703  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
704  schedule = team -> t.t_sched.r_sched_type;
705  // Detail the schedule if needed (global controls are differentiated appropriately)
706  if ( schedule == kmp_sch_guided_chunked ) {
707  schedule = __kmp_guided;
708  } else if ( schedule == kmp_sch_static ) {
709  schedule = __kmp_static;
710  }
711  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
712  chunk = team -> t.t_sched.chunk;
713 #if USE_ITT_BUILD
714  cur_chunk = chunk;
715 #endif
716  #ifdef KMP_DEBUG
717  {
718  const char * buff;
719  // create format specifiers before the debug output
720  buff = __kmp_str_format(
721  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
722  traits_t< ST >::spec );
723  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
724  __kmp_str_free( &buff );
725  }
726  #endif
727  } else {
728  if ( schedule == kmp_sch_guided_chunked ) {
729  schedule = __kmp_guided;
730  }
731  if ( chunk <= 0 ) {
732  chunk = KMP_DEFAULT_CHUNK;
733  }
734  }
735 
736  if ( schedule == kmp_sch_auto ) {
737  // mapping and differentiation: in the __kmp_do_serial_initialize()
738  schedule = __kmp_auto;
739  #ifdef KMP_DEBUG
740  {
741  const char * buff;
742  // create format specifiers before the debug output
743  buff = __kmp_str_format(
744  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
745  traits_t< ST >::spec );
746  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
747  __kmp_str_free( &buff );
748  }
749  #endif
750  }
751 
 752  /* guided analytical is not safe for very large thread counts (> 2^20 threads) */
753  if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) {
754  schedule = kmp_sch_guided_iterative_chunked;
755  KMP_WARNING( DispatchManyThreads );
756  }
757  pr->u.p.parm1 = chunk;
758  }
759  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
760  "unknown scheduling type" );
761 
762  pr->u.p.count = 0;
763 
764  if ( __kmp_env_consistency_check ) {
765  if ( st == 0 ) {
766  __kmp_error_construct(
767  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
768  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
769  );
770  }
771  }
772  // compute trip count
773  if ( st == 1 ) { // most common case
774  if ( ub >= lb ) {
775  tc = ub - lb + 1;
776  } else { // ub < lb
777  tc = 0; // zero-trip
778  }
779  } else if ( st < 0 ) {
780  if ( lb >= ub ) {
781  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
782  // where the division needs to be unsigned regardless of the result type
783  tc = (UT)(lb - ub) / (-st) + 1;
784  } else { // lb < ub
785  tc = 0; // zero-trip
786  }
787  } else { // st > 0
788  if ( ub >= lb ) {
789  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
790  // where the division needs to be unsigned regardless of the result type
791  tc = (UT)(ub - lb) / st + 1;
792  } else { // ub < lb
793  tc = 0; // zero-trip
794  }
795  }
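  // Worked example (illustrative): for a 32-bit signed loop with lb = -2,000,000,000,
  // ub = 2,000,000,000 and st = 1,000,000,000, the signed difference ub - lb would overflow,
  // but the unsigned cast gives
  //   tc = (UT)(4,000,000,000) / 1,000,000,000 + 1 = 5 iterations.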
796 
797  // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
798  // when statistics are disabled.
799  if (schedule == __kmp_static)
800  {
801  KMP_COUNT_BLOCK(OMP_FOR_static);
802  KMP_COUNT_VALUE(FOR_static_iterations, tc);
803  }
804  else
805  {
806  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
807  KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
808  }
809 
810  pr->u.p.lb = lb;
811  pr->u.p.ub = ub;
812  pr->u.p.st = st;
813  pr->u.p.tc = tc;
814 
815  #if KMP_OS_WINDOWS
816  pr->u.p.last_upper = ub + st;
817  #endif /* KMP_OS_WINDOWS */
818 
819  /* NOTE: only the active parallel region(s) has active ordered sections */
820 
821  if ( active ) {
822  if ( pr->ordered == 0 ) {
823  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
824  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
825  } else {
826  pr->ordered_bumped = 0;
827 
828  pr->u.p.ordered_lower = 1;
829  pr->u.p.ordered_upper = 0;
830 
831  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
832  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
833  }
834  }
835 
836  if ( __kmp_env_consistency_check ) {
837  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
838  if ( push_ws ) {
839  __kmp_push_workshare( gtid, ws, loc );
840  pr->pushed_ws = ws;
841  } else {
842  __kmp_check_workshare( gtid, ws, loc );
843  pr->pushed_ws = ct_none;
844  }
845  }
846 
847  switch ( schedule ) {
848  #if ( KMP_STATIC_STEAL_ENABLED )
 849  case kmp_sch_static_steal:
 850  {
851  T nproc = th->th.th_team_nproc;
852  T ntc, init;
853 
854  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
855 
856  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
857  if ( nproc > 1 && ntc >= nproc ) {
858  T id = __kmp_tid_from_gtid(gtid);
859  T small_chunk, extras;
860 
861  small_chunk = ntc / nproc;
862  extras = ntc % nproc;
863 
864  init = id * small_chunk + ( id < extras ? id : extras );
865  pr->u.p.count = init;
866  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
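  // Worked example (illustrative): ntc = 10 chunks, nproc = 4 -> small_chunk = 2, extras = 2;
  // thread ids 0..3 initially own 3, 3, 2 and 2 chunks respectively, e.g. for id = 2:
  //   init = 2*2 + 2 = 6, ub = 6 + 2 + 0 = 8, i.e. chunks 6 and 7.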
867 
868  pr->u.p.parm2 = lb;
869  //pr->pfields.parm3 = 0; // it's not used in static_steal
870  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
871  pr->u.p.st = st;
872  if ( ___kmp_size_type > 4 ) {
873  // AC: TODO: check if 16-byte CAS available and use it to
874  // improve performance (probably wait for explicit request
875  // before spending time on this).
876  // For now use dynamically allocated per-thread lock,
877  // free memory in __kmp_dispatch_next when status==0.
878  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
879  th->th.th_dispatch->th_steal_lock =
880  (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t));
881  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
882  }
883  break;
884  } else {
885  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
886  gtid ) );
887  schedule = kmp_sch_static_balanced;
888  /* too few iterations: fall-through to kmp_sch_static_balanced */
889  } // if
890  /* FALL-THROUGH to static balanced */
891  } // case
892  #endif
893  case kmp_sch_static_balanced:
894  {
895  T nproc = th->th.th_team_nproc;
896  T init, limit;
897 
898  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
899  gtid ) );
900 
901  if ( nproc > 1 ) {
902  T id = __kmp_tid_from_gtid(gtid);
903 
904  if ( tc < nproc ) {
905  if ( id < tc ) {
906  init = id;
907  limit = id;
908  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
909  } else {
910  pr->u.p.count = 1; /* means no more chunks to execute */
911  pr->u.p.parm1 = FALSE;
912  break;
913  }
914  } else {
915  T small_chunk = tc / nproc;
916  T extras = tc % nproc;
917  init = id * small_chunk + (id < extras ? id : extras);
918  limit = init + small_chunk - (id < extras ? 0 : 1);
919  pr->u.p.parm1 = (id == nproc - 1);
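  // Worked example (illustrative): tc = 10 iterations, nproc = 4 -> small_chunk = 2, extras = 2;
  // id = 1 gets init = 1*2 + 1 = 3, limit = 3 + 2 - 0 = 5 (iterations 3..5),
  // id = 3 gets init = 3*2 + 2 = 8, limit = 8 + 2 - 1 = 9 (iterations 8..9).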
920  }
921  } else {
922  if ( tc > 0 ) {
923  init = 0;
924  limit = tc - 1;
925  pr->u.p.parm1 = TRUE;
926  } else {
927  // zero trip count
928  pr->u.p.count = 1; /* means no more chunks to execute */
929  pr->u.p.parm1 = FALSE;
930  break;
931  }
932  }
933 #if USE_ITT_BUILD
934  // Calculate chunk for metadata report
935  if ( itt_need_metadata_reporting )
936  cur_chunk = limit - init + 1;
937 #endif
938  if ( st == 1 ) {
939  pr->u.p.lb = lb + init;
940  pr->u.p.ub = lb + limit;
941  } else {
942  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
943  pr->u.p.lb = lb + init * st;
944  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
945  if ( st > 0 ) {
946  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
947  } else {
948  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
949  }
950  }
951  if ( pr->ordered ) {
952  pr->u.p.ordered_lower = init;
953  pr->u.p.ordered_upper = limit;
954  }
955  break;
956  } // case
957  case kmp_sch_guided_iterative_chunked :
958  {
959  T nproc = th->th.th_team_nproc;
960  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
961 
962  if ( nproc > 1 ) {
963  if ( (2L * chunk + 1 ) * nproc >= tc ) {
964  /* chunk size too large, switch to dynamic */
965  schedule = kmp_sch_dynamic_chunked;
966  } else {
967  // when remaining iters become less than parm2 - switch to dynamic
968  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
969  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
970  }
971  } else {
972  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
973  schedule = kmp_sch_static_greedy;
974  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
975  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
976  pr->u.p.parm1 = tc;
977  } // if
978  } // case
979  break;
980  case kmp_sch_guided_analytical_chunked:
981  {
982  T nproc = th->th.th_team_nproc;
983  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
984 
985  if ( nproc > 1 ) {
986  if ( (2L * chunk + 1 ) * nproc >= tc ) {
987  /* chunk size too large, switch to dynamic */
988  schedule = kmp_sch_dynamic_chunked;
989  } else {
990  /* commonly used term: (2 nproc - 1)/(2 nproc) */
991  DBL x;
992 
993  #if KMP_OS_WINDOWS && KMP_ARCH_X86
994  /* Linux* OS already has 64-bit computation by default for
995  long double, and on Windows* OS on Intel(R) 64,
996  /Qlong_double doesn't work. On Windows* OS
997  on IA-32 architecture, we need to set precision to
998  64-bit instead of the default 53-bit. Even though long
999  double doesn't work on Windows* OS on Intel(R) 64, the
1000  resulting lack of precision is not expected to impact
1001  the correctness of the algorithm, but this has not been
1002  mathematically proven.
1003  */
1004  // save original FPCW and set precision to 64-bit, as
1005  // Windows* OS on IA-32 architecture defaults to 53-bit
1006  unsigned int oldFpcw = _control87(0,0);
1007  _control87(_PC_64,_MCW_PC); // 0,0x30000
1008  #endif
1009  /* value used for comparison in solver for cross-over point */
1010  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
1011 
1012  /* crossover point--chunk indexes equal to or greater than
1013  this point switch to dynamic-style scheduling */
1014  UT cross;
1015 
1016  /* commonly used term: (2 nproc - 1)/(2 nproc) */
1017  x = (long double)1.0 - (long double)0.5 / nproc;
1018 
1019  #ifdef KMP_DEBUG
1020  { // test natural alignment
1021  struct _test_a {
1022  char a;
1023  union {
1024  char b;
1025  DBL d;
1026  };
1027  } t;
1028  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
1029  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
1030  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
1031  }
1032  #endif // KMP_DEBUG
1033 
1034  /* save the term in thread private dispatch structure */
1035  *(DBL*)&pr->u.p.parm3 = x;
1036 
1037  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1038  {
1039  UT left, right, mid;
1040  long double p;
1041 
1042  /* estimate initial upper and lower bound */
1043 
1044  /* doesn't matter what value right is as long as it is positive, but
1045  it affects performance of the solver
1046  */
1047  right = 229;
1048  p = __kmp_pow< UT >(x,right);
1049  if ( p > target ) {
1050  do{
1051  p *= p;
1052  right <<= 1;
1053  } while(p>target && right < (1<<27));
1054  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1055  } else {
1056  left = 0;
1057  }
1058 
1059  /* bisection root-finding method */
1060  while ( left + 1 < right ) {
1061  mid = (left + right) / 2;
1062  if ( __kmp_pow< UT >(x,mid) > target ) {
1063  left = mid;
1064  } else {
1065  right = mid;
1066  }
1067  } // while
1068  cross = right;
1069  }
1070  /* assert sanity of computed crossover point */
1071  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
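  // Worked example (illustrative, hypothetical values): nproc = 4, chunk = 7, tc = 1000 give
  //   x = 1 - 0.5/4 = 0.875 and target = (2*7 + 1)*4/1000 = 0.06;
  // the bisection above finds cross = 22, since 0.875^21 ~= 0.0606 > 0.06 >= 0.875^22 ~= 0.0530.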
1072 
1073  /* save the crossover point in thread private dispatch structure */
1074  pr->u.p.parm2 = cross;
1075 
1076  // C75803
1077  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1078  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1079  #else
1080  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1081  #endif
1082  /* dynamic-style scheduling offset */
1083  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1084  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1085  // restore FPCW
1086  _control87(oldFpcw,_MCW_PC);
1087  #endif
1088  } // if
1089  } else {
1090  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1091  gtid ) );
1092  schedule = kmp_sch_static_greedy;
1093  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1094  pr->u.p.parm1 = tc;
1095  } // if
1096  } // case
1097  break;
1098  case kmp_sch_static_greedy:
1099  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1100  pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ?
1101  ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc :
1102  tc;
1103  break;
1104  case kmp_sch_static_chunked :
1105  case kmp_sch_dynamic_chunked :
1106  if ( pr->u.p.parm1 <= 0 ) {
1107  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1108  }
1109  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1110  break;
1111  case kmp_sch_trapezoidal :
1112  {
1113  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1114 
1115  T parm1, parm2, parm3, parm4;
1116  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1117 
1118  parm1 = chunk;
1119 
1120  /* F : size of the first cycle */
1121  parm2 = ( tc / (2 * th->th.th_team_nproc) );
1122 
1123  if ( parm2 < 1 ) {
1124  parm2 = 1;
1125  }
1126 
1127  /* L : size of the last cycle. Make sure the last cycle
1128  * is not larger than the first cycle.
1129  */
1130  if ( parm1 < 1 ) {
1131  parm1 = 1;
1132  } else if ( parm1 > parm2 ) {
1133  parm1 = parm2;
1134  }
1135 
1136  /* N : number of cycles */
1137  parm3 = ( parm2 + parm1 );
1138  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1139 
1140  if ( parm3 < 2 ) {
1141  parm3 = 2;
1142  }
1143 
1144  /* sigma : decreasing incr of the trapezoid */
1145  parm4 = ( parm3 - 1 );
1146  parm4 = ( parm2 - parm1 ) / parm4;
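  // Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 ->
  //   parm2 (first chunk size) = 1000 / 8 = 125,
  //   parm1 (last chunk size)  = 1,
  //   parm3 (number of cycles) = (2000 + 126 - 1) / 126 = 16,
  //   parm4 (decrement)        = (125 - 1) / 15 = 8,
  // so successive chunk sizes are 125, 117, 109, ... down to the minimum chunk size.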
1147 
1148  // pointless check, because parm4 >= 0 always
1149  //if ( parm4 < 0 ) {
1150  // parm4 = 0;
1151  //}
1152 
1153  pr->u.p.parm1 = parm1;
1154  pr->u.p.parm2 = parm2;
1155  pr->u.p.parm3 = parm3;
1156  pr->u.p.parm4 = parm4;
1157  } // case
1158  break;
1159 
1160  default:
1161  {
1162  __kmp_msg(
1163  kmp_ms_fatal, // Severity
1164  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1165  KMP_HNT( GetNewerLibrary ), // Hint
1166  __kmp_msg_null // Variadic argument list terminator
1167  );
1168  }
1169  break;
1170  } // switch
1171  pr->schedule = schedule;
1172  if ( active ) {
 1173  /* This buffer is free to use when sh->buffer_index reaches my_buffer_index */
1174 
1175  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1176  gtid, my_buffer_index, sh->buffer_index) );
1177  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1178  USE_ITT_BUILD_ARG( NULL )
1179  );
1180  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1181  // *always* 32-bit integers.
1182  KMP_MB(); /* is this necessary? */
1183  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1184  gtid, my_buffer_index, sh->buffer_index) );
1185 
1186  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1187  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1188 #if USE_ITT_BUILD
1189  if ( pr->ordered ) {
1190  __kmp_itt_ordered_init( gtid );
1191  }; // if
1192  // Report loop metadata
1193  if ( itt_need_metadata_reporting ) {
1194  // Only report metadata by master of active team at level 1
1195  kmp_uint64 schedtype = 0;
1196  switch ( schedule ) {
1197  case kmp_sch_static_chunked:
1198  case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1199  break;
1200  case kmp_sch_static_greedy:
1201  cur_chunk = pr->u.p.parm1;
1202  break;
1203  case kmp_sch_dynamic_chunked:
1204  schedtype = 1;
1205  break;
1206  case kmp_sch_guided_iterative_chunked:
1207  case kmp_sch_guided_analytical_chunked:
1208  schedtype = 2;
1209  break;
1210  default:
1211 // Should we put this case under "static"?
1212 // case kmp_sch_static_steal:
1213  schedtype = 3;
1214  break;
1215  }
1216  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1217  }
1218 #endif /* USE_ITT_BUILD */
1219  }; // if
1220 
1221  #ifdef KMP_DEBUG
1222  {
1223  const char * buff;
1224  // create format specifiers before the debug output
1225  buff = __kmp_str_format(
1226  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1227  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1228  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1229  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1230  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1231  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1232  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1233  KD_TRACE(10, ( buff,
1234  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1235  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1236  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1237  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1238  __kmp_str_free( &buff );
1239  }
1240  #endif
1241  #if ( KMP_STATIC_STEAL_ENABLED )
1242  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1243  // all the parm3 variables will contain the same value.
 1244  // Even if all parm3 values were the same, a bad case would still exist, e.g. using 0 and 1
 1245  // rather than a program life-time increment.
 1246  // So a dedicated variable is required; 'static_steal_counter' is used.
 1247  if( schedule == kmp_sch_static_steal ) {
 1248  // Other threads will inspect this variable when searching for a victim.
 1249  // This is a flag showing that other threads may steal from this thread from now on.
1250  volatile T * p = &pr->u.p.static_steal_counter;
1251  *p = *p + 1;
1252  }
1253  #endif // ( KMP_STATIC_STEAL_ENABLED )
1254 
1255 #if OMPT_SUPPORT && OMPT_TRACE
1256  if (ompt_enabled &&
1257  ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1258  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1259  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1260  ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1261  team_info->parallel_id, task_info->task_id, team_info->microtask);
1262  }
1263 #endif
1264 }
1265 
1266 /*
1267  * For ordered loops, either __kmp_dispatch_finish() should be called after
1268  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1269  * every chunk of iterations. If the ordered section(s) were not executed
1270  * for this iteration (or every iteration in this chunk), we need to set the
1271  * ordered iteration counters so that the next thread can proceed.
1272  */
1273 template< typename UT >
1274 static void
1275 __kmp_dispatch_finish( int gtid, ident_t *loc )
1276 {
1277  typedef typename traits_t< UT >::signed_t ST;
1278  kmp_info_t *th = __kmp_threads[ gtid ];
1279 
1280  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1281  if ( ! th -> th.th_team -> t.t_serialized ) {
1282 
1283  dispatch_private_info_template< UT > * pr =
1284  reinterpret_cast< dispatch_private_info_template< UT >* >
1285  ( th->th.th_dispatch->th_dispatch_pr_current );
1286  dispatch_shared_info_template< UT > volatile * sh =
1287  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1288  ( th->th.th_dispatch->th_dispatch_sh_current );
1289  KMP_DEBUG_ASSERT( pr );
1290  KMP_DEBUG_ASSERT( sh );
1291  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1292  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1293 
1294  if ( pr->ordered_bumped ) {
1295  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1296  gtid ) );
1297  pr->ordered_bumped = 0;
1298  } else {
1299  UT lower = pr->u.p.ordered_lower;
1300 
1301  #ifdef KMP_DEBUG
1302  {
1303  const char * buff;
1304  // create format specifiers before the debug output
1305  buff = __kmp_str_format(
1306  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1307  traits_t< UT >::spec, traits_t< UT >::spec );
1308  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1309  __kmp_str_free( &buff );
1310  }
1311  #endif
1312 
1313  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1314  USE_ITT_BUILD_ARG(NULL)
1315  );
1316  KMP_MB(); /* is this necessary? */
1317  #ifdef KMP_DEBUG
1318  {
1319  const char * buff;
1320  // create format specifiers before the debug output
1321  buff = __kmp_str_format(
1322  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1323  traits_t< UT >::spec, traits_t< UT >::spec );
1324  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1325  __kmp_str_free( &buff );
1326  }
1327  #endif
1328 
1329  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1330  } // if
1331  } // if
1332  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1333 }
1334 
1335 #ifdef KMP_GOMP_COMPAT
1336 
1337 template< typename UT >
1338 static void
1339 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1340 {
1341  typedef typename traits_t< UT >::signed_t ST;
1342  kmp_info_t *th = __kmp_threads[ gtid ];
1343 
1344  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1345  if ( ! th -> th.th_team -> t.t_serialized ) {
1346 // int cid;
1347  dispatch_private_info_template< UT > * pr =
1348  reinterpret_cast< dispatch_private_info_template< UT >* >
1349  ( th->th.th_dispatch->th_dispatch_pr_current );
1350  dispatch_shared_info_template< UT > volatile * sh =
1351  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1352  ( th->th.th_dispatch->th_dispatch_sh_current );
1353  KMP_DEBUG_ASSERT( pr );
1354  KMP_DEBUG_ASSERT( sh );
1355  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1356  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1357 
1358 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1359  UT lower = pr->u.p.ordered_lower;
1360  UT upper = pr->u.p.ordered_upper;
1361  UT inc = upper - lower + 1;
1362 
1363  if ( pr->ordered_bumped == inc ) {
1364  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1365  gtid ) );
1366  pr->ordered_bumped = 0;
1367  } else {
1368  inc -= pr->ordered_bumped;
1369 
1370  #ifdef KMP_DEBUG
1371  {
1372  const char * buff;
1373  // create format specifiers before the debug output
1374  buff = __kmp_str_format(
1375  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1376  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1377  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1378  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1379  __kmp_str_free( &buff );
1380  }
1381  #endif
1382 
1383  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1384  USE_ITT_BUILD_ARG(NULL)
1385  );
1386 
1387  KMP_MB(); /* is this necessary? */
1388  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1389  gtid ) );
1390  pr->ordered_bumped = 0;
1392  #ifdef KMP_DEBUG
1393  {
1394  const char * buff;
1395  // create format specifiers before the debug output
1396  buff = __kmp_str_format(
1397  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1398  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1399  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1400  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1401  __kmp_str_free( &buff );
1402  }
1403  #endif
1404 
1405  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1406  }
1407 // }
1408  }
1409  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1410 }
1411 
1412 #endif /* KMP_GOMP_COMPAT */
1413 
1414 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1415  * (no more work), then tell OMPT the loop is over. In some cases
1416  * kmp_dispatch_fini() is not called. */
1417 #if OMPT_SUPPORT && OMPT_TRACE
1418 #define OMPT_LOOP_END \
1419  if (status == 0) { \
1420  if (ompt_enabled && \
1421  ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1422  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1423  ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1424  ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1425  team_info->parallel_id, task_info->task_id); \
1426  } \
1427  }
1428 #else
1429 #define OMPT_LOOP_END // no-op
1430 #endif
1431 
1432 template< typename T >
1433 static int
1434 __kmp_dispatch_next(
1435  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1436 ) {
1437 
1438  typedef typename traits_t< T >::unsigned_t UT;
1439  typedef typename traits_t< T >::signed_t ST;
1440  typedef typename traits_t< T >::floating_t DBL;
1441 #if ( KMP_STATIC_STEAL_ENABLED )
1442  static const int ___kmp_size_type = sizeof( UT );
1443 #endif
1444 
 1445  // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
 1446  // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
 1447  // more than a compile-time choice to use static scheduling would.)
1448  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1449 
1450  int status;
1451  dispatch_private_info_template< T > * pr;
1452  kmp_info_t * th = __kmp_threads[ gtid ];
1453  kmp_team_t * team = th -> th.th_team;
1454 
1455  KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1456  #ifdef KMP_DEBUG
1457  {
1458  const char * buff;
1459  // create format specifiers before the debug output
1460  buff = __kmp_str_format(
1461  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1462  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1463  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1464  __kmp_str_free( &buff );
1465  }
1466  #endif
1467 
1468  if ( team -> t.t_serialized ) {
 1469  /* NOTE: serialize this dispatch because we are not at the active level */
1470  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1471  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1472  KMP_DEBUG_ASSERT( pr );
1473 
1474  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1475  *p_lb = 0;
1476  *p_ub = 0;
1477 // if ( p_last != NULL )
1478 // *p_last = 0;
1479  if ( p_st != NULL )
1480  *p_st = 0;
1481  if ( __kmp_env_consistency_check ) {
1482  if ( pr->pushed_ws != ct_none ) {
1483  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1484  }
1485  }
1486  } else if ( pr->nomerge ) {
1487  kmp_int32 last;
1488  T start;
1489  UT limit, trip, init;
1490  ST incr;
1491  T chunk = pr->u.p.parm1;
1492 
1493  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1494 
1495  init = chunk * pr->u.p.count++;
1496  trip = pr->u.p.tc - 1;
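  // Worked example (illustrative): chunk = 4, tc = 10 -> trip = 9; successive calls claim
  // init = 0, 4, 8 with limit = 3, 7, 9 (the last chunk is clipped to trip and reported as last).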
1497 
1498  if ( (status = (init <= trip)) == 0 ) {
1499  *p_lb = 0;
1500  *p_ub = 0;
1501 // if ( p_last != NULL )
1502 // *p_last = 0;
1503  if ( p_st != NULL )
1504  *p_st = 0;
1505  if ( __kmp_env_consistency_check ) {
1506  if ( pr->pushed_ws != ct_none ) {
1507  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1508  }
1509  }
1510  } else {
1511  start = pr->u.p.lb;
1512  limit = chunk + init - 1;
1513  incr = pr->u.p.st;
1514 
1515  if ( (last = (limit >= trip)) != 0 ) {
1516  limit = trip;
1517  #if KMP_OS_WINDOWS
1518  pr->u.p.last_upper = pr->u.p.ub;
1519  #endif /* KMP_OS_WINDOWS */
1520  }
1521  if ( p_last != NULL )
1522  *p_last = last;
1523  if ( p_st != NULL )
1524  *p_st = incr;
1525  if ( incr == 1 ) {
1526  *p_lb = start + init;
1527  *p_ub = start + limit;
1528  } else {
1529  *p_lb = start + init * incr;
1530  *p_ub = start + limit * incr;
1531  }
1532 
1533  if ( pr->ordered ) {
1534  pr->u.p.ordered_lower = init;
1535  pr->u.p.ordered_upper = limit;
1536  #ifdef KMP_DEBUG
1537  {
1538  const char * buff;
1539  // create format specifiers before the debug output
1540  buff = __kmp_str_format(
1541  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1542  traits_t< UT >::spec, traits_t< UT >::spec );
1543  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1544  __kmp_str_free( &buff );
1545  }
1546  #endif
1547  } // if
1548  } // if
1549  } else {
1550  pr->u.p.tc = 0;
1551  *p_lb = pr->u.p.lb;
1552  *p_ub = pr->u.p.ub;
1553  #if KMP_OS_WINDOWS
1554  pr->u.p.last_upper = *p_ub;
1555  #endif /* KMP_OS_WINDOWS */
1556  if ( p_last != NULL )
1557  *p_last = TRUE;
1558  if ( p_st != NULL )
1559  *p_st = pr->u.p.st;
1560  } // if
1561  #ifdef KMP_DEBUG
1562  {
1563  const char * buff;
1564  // create format specifiers before the debug output
1565  buff = __kmp_str_format(
1566  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1567  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1568  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1569  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1570  __kmp_str_free( &buff );
1571  }
1572  #endif
1573 #if INCLUDE_SSC_MARKS
1574  SSC_MARK_DISPATCH_NEXT();
1575 #endif
1576  OMPT_LOOP_END;
1577  return status;
1578  } else {
1579  kmp_int32 last = 0;
1580  dispatch_shared_info_template< UT > *sh;
1581  T start;
1582  ST incr;
1583  UT limit, trip, init;
1584 
1585  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1586  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1587 
1588  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1589  ( th->th.th_dispatch->th_dispatch_pr_current );
1590  KMP_DEBUG_ASSERT( pr );
1591  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1592  ( th->th.th_dispatch->th_dispatch_sh_current );
1593  KMP_DEBUG_ASSERT( sh );
1594 
1595  if ( pr->u.p.tc == 0 ) {
1596  // zero trip count
1597  status = 0;
1598  } else {
1599  switch (pr->schedule) {
1600  #if ( KMP_STATIC_STEAL_ENABLED )
1601  case kmp_sch_static_steal:
1602  {
1603  T chunk = pr->u.p.parm1;
1604  int nproc = th->th.th_team_nproc;
1605 
1606  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1607 
1608  trip = pr->u.p.tc - 1;
1609 
1610  if ( ___kmp_size_type > 4 ) {
1611  // use lock for 8-byte and CAS for 4-byte induction
1612  // variable. TODO (optional): check and use 16-byte CAS
1613  kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
1614  KMP_DEBUG_ASSERT(lck != NULL);
1615  if( pr->u.p.count < (UT)pr->u.p.ub ) {
1616  __kmp_acquire_lock(lck, gtid);
1617  // try to get own chunk of iterations
1618  init = ( pr->u.p.count )++;
1619  status = ( init < (UT)pr->u.p.ub );
1620  __kmp_release_lock(lck, gtid);
1621  } else {
1622  status = 0; // no own chunks
1623  }
1624  if( !status ) { // try to steal
1625  kmp_info_t **other_threads = team->t.t_threads;
1626  int while_limit = nproc; // nproc attempts to find a victim
1627  int while_index = 0;
1628  // TODO: algorithm of searching for a victim
1629  // should be cleaned up and measured
1630  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1631  T remaining;
1632  T victimIdx = pr->u.p.parm4;
1633  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1634  dispatch_private_info_template< T > * victim =
1635  reinterpret_cast< dispatch_private_info_template< T >* >
1636  (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1637  while( ( victim == NULL || victim == pr ||
1638  ( *(volatile T*)&victim->u.p.static_steal_counter !=
1639  *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
1640  oldVictimIdx != victimIdx )
1641  {
1642  victimIdx = (victimIdx + 1) % nproc;
1643  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1644  (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1645  };
1646  if( !victim ||
1647  ( *(volatile T *)&victim->u.p.static_steal_counter !=
1648  *(volatile T *)&pr->u.p.static_steal_counter ) )
1649  {
1650  continue; // try once more (nproc attempts in total)
1651  // no victim is ready yet to participate in stealing
1652  // because all victims are still in kmp_init_dispatch
1653  }
1654  if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
1655  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1656  continue; // not enough chunks to steal, goto next victim
1657  }
1658 
1659  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1660  KMP_ASSERT(lck != NULL);
1661  __kmp_acquire_lock(lck, gtid);
1662  limit = victim->u.p.ub; // keep initial ub
1663  if( victim->u.p.count >= limit ||
1664  (remaining = limit - victim->u.p.count) < 2 )
1665  {
1666  __kmp_release_lock(lck, gtid);
1667  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1668  continue; // not enough chunks to steal
1669  }
1670  // stealing succeeded; reduce the victim's ub by 1/4 of the undone chunks, or by 1 (see the worked example after this case)
1671  if( remaining > 3 ) {
1672  init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
1673  } else {
1674  init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
1675  }
1676  __kmp_release_lock(lck, gtid);
1677 
1678  KMP_DEBUG_ASSERT(init + 1 <= limit);
1679  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1680  status = 1;
1681  while_index = 0;
1682  // now update own count and ub with the stolen range, excluding the init chunk taken above
1683  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1684  pr->u.p.count = init + 1;
1685  pr->u.p.ub = limit;
1686  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1687  } // while (search for victim)
1688  } // if (try to find victim and steal)
1689  } else {
1690  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1691  typedef union {
1692  struct {
1693  UT count;
1694  T ub;
1695  } p;
1696  kmp_int64 b;
1697  } union_i4;
1698  // All operations on 'count' and 'ub' must be performed atomically on the pair; see the sketch after the claim block below.
1699  {
1700  union_i4 vold, vnew;
1701  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1702  vnew = vold;
1703  vnew.p.count++;
1704  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1705  ( volatile kmp_int64* )&pr->u.p.count,
1706  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1707  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1708  KMP_CPU_PAUSE();
1709  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1710  vnew = vold;
1711  vnew.p.count++;
1712  }
1713  vnew = vold;
1714  init = vnew.p.count;
1715  status = ( init < (UT)vnew.p.ub ) ;
1716  }
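 // Illustrative sketch (not part of the runtime): the union_i4 trick packs the 4-byte
 // 'count' and 'ub' fields into a single 8-byte word so that one 64-bit CAS can update
 // the pair atomically. A minimal self-contained version of the same claim loop, using
 // std::atomic instead of KMP_COMPARE_AND_STORE_ACQ64 and hypothetical names:
 //
 //   #include <atomic>
 //   #include <cstdint>
 //   union pair64 { struct { uint32_t count; uint32_t ub; } p; int64_t b; };
 //   // claim the next chunk index; returns false once count has reached ub
 //   static bool claim_chunk( std::atomic<int64_t> &packed, uint32_t &init ) {
 //       pair64 vold, vnew;
 //       vold.b = packed.load();
 //       do {
 //           vnew = vold;
 //           vnew.p.count++;                      // same increment as above
 //       } while( !packed.compare_exchange_weak( vold.b, vnew.b ) );
 //       init = vold.p.count;                     // value observed before the increment
 //       return init < vold.p.ub;
 //   }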
1717 
1718  if( !status ) {
1719  kmp_info_t **other_threads = team->t.t_threads;
1720  int while_limit = nproc; // nproc attempts to find a victim
1721  int while_index = 0;
1722 
1723  // TODO: the algorithm for searching for a victim
1724  // should be cleaned up and measured
1725  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1726  union_i4 vold, vnew;
1727  kmp_int32 remaining;
1728  T victimIdx = pr->u.p.parm4;
1729  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1730  dispatch_private_info_template< T > * victim =
1731  reinterpret_cast< dispatch_private_info_template< T >* >
1732  (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1733  while( (victim == NULL || victim == pr ||
1734  (*(volatile T*)&victim->u.p.static_steal_counter !=
1735  *(volatile T*)&pr->u.p.static_steal_counter)) &&
1736  oldVictimIdx != victimIdx )
1737  {
1738  victimIdx = (victimIdx + 1) % nproc;
1739  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1740  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1741  };
1742  if( !victim ||
1743  ( *(volatile T *)&victim->u.p.static_steal_counter !=
1744  *(volatile T *)&pr->u.p.static_steal_counter ) )
1745  {
1746  continue; // try once more (nproc attempts in total)
1747  // no victim is ready yet to participate in stealing
1748  // because all victims are still in kmp_init_dispatch
1749  }
1750  pr->u.p.parm4 = victimIdx; // new victim found
1751  while( 1 ) { // CAS loop if victim has enough chunks to steal
1752  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1753  vnew = vold;
1754 
1755  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1756  if ( vnew.p.count >= (UT)vnew.p.ub ||
1757  (remaining = vnew.p.ub - vnew.p.count) < 2 )
1758  {
1759  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1760  break; // not enough chunks to steal, goto next victim
1761  }
1762  if( remaining > 3 ) {
1763  vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining
1764  } else {
1765  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1766  }
1767  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1768  // TODO: Should this be acquire or release?
1769  if ( KMP_COMPARE_AND_STORE_ACQ64(
1770  ( volatile kmp_int64 * )&victim->u.p.count,
1771  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1772  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1773  // stealing succeeded
1774  status = 1;
1775  while_index = 0;
1776  // now update own count and ub
1777  init = vnew.p.ub;
1778  vold.p.count = init + 1;
1779  #if KMP_ARCH_X86
1780  KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
1781  #else
1782  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1783  #endif
1784  break;
1785  } // if (check CAS result)
1786  KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1787  } // while (try to steal from particular victim)
1788  } // while (search for victim)
1789  } // if (try to find victim and steal)
1790  } // if (4-byte induction variable)
1791  if ( !status ) {
1792  *p_lb = 0;
1793  *p_ub = 0;
1794  if ( p_st != NULL ) *p_st = 0;
1795  } else {
1796  start = pr->u.p.parm2;
1797  init *= chunk;
1798  limit = chunk + init - 1;
1799  incr = pr->u.p.st;
1800 
1801  KMP_DEBUG_ASSERT(init <= trip);
1802  if ( (last = (limit >= trip)) != 0 )
1803  limit = trip;
1804  if ( p_st != NULL ) *p_st = incr;
1805 
1806  if ( incr == 1 ) {
1807  *p_lb = start + init;
1808  *p_ub = start + limit;
1809  } else {
1810  *p_lb = start + init * incr;
1811  *p_ub = start + limit * incr;
1812  }
1813 
1814  if ( pr->ordered ) {
1815  pr->u.p.ordered_lower = init;
1816  pr->u.p.ordered_upper = limit;
1817  #ifdef KMP_DEBUG
1818  {
1819  const char * buff;
1820  // create format specifiers before the debug output
1821  buff = __kmp_str_format(
1822  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1823  traits_t< UT >::spec, traits_t< UT >::spec );
1824  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1825  __kmp_str_free( &buff );
1826  }
1827  #endif
1828  } // if
1829  } // if
1830  break;
1831  } // case
1832  #endif // ( KMP_STATIC_STEAL_ENABLED )
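 // Illustrative note (not part of the runtime): both steal paths above shrink the
 // victim's 'ub'. With remaining = ub - count undone chunks, the thief takes
 // remaining >> 2 chunks when remaining > 3, otherwise exactly one chunk; victims
 // observed with fewer than two undone chunks are skipped. Worked example: a victim
 // with count = 10 and ub = 30 has 20 chunks left, so the thief lowers the victim's ub
 // to 25, executes chunk 25 immediately (init), and keeps chunks 26..29 for itself
 // (count = 26, ub = 30 in its own descriptor), leaving the victim with chunks 10..24.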
1833  case kmp_sch_static_balanced:
1834  {
1835  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1836  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1837  pr->u.p.count = 1;
1838  *p_lb = pr->u.p.lb;
1839  *p_ub = pr->u.p.ub;
1840  last = pr->u.p.parm1;
1841  if ( p_st != NULL )
1842  *p_st = pr->u.p.st;
1843  } else { /* no iterations to do */
1844  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1845  }
1846  if ( pr->ordered ) {
1847  #ifdef KMP_DEBUG
1848  {
1849  const char * buff;
1850  // create format specifiers before the debug output
1851  buff = __kmp_str_format(
1852  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1853  traits_t< UT >::spec, traits_t< UT >::spec );
1854  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1855  __kmp_str_free( &buff );
1856  }
1857  #endif
1858  } // if
1859  } // case
1860  break;
1861  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1862  case kmp_sch_static_chunked:
1863  {
1864  T parm1;
1865 
1866  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1867  gtid ) );
1868  parm1 = pr->u.p.parm1;
1869 
1870  trip = pr->u.p.tc - 1;
1871  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1872 
1873  if ( (status = (init <= trip)) != 0 ) {
1874  start = pr->u.p.lb;
1875  incr = pr->u.p.st;
1876  limit = parm1 + init - 1;
1877 
1878  if ( (last = (limit >= trip)) != 0 )
1879  limit = trip;
1880 
1881  if ( p_st != NULL ) *p_st = incr;
1882 
1883  pr->u.p.count += th->th.th_team_nproc;
1884 
1885  if ( incr == 1 ) {
1886  *p_lb = start + init;
1887  *p_ub = start + limit;
1888  }
1889  else {
1890  *p_lb = start + init * incr;
1891  *p_ub = start + limit * incr;
1892  }
1893 
1894  if ( pr->ordered ) {
1895  pr->u.p.ordered_lower = init;
1896  pr->u.p.ordered_upper = limit;
1897  #ifdef KMP_DEBUG
1898  {
1899  const char * buff;
1900  // create format specifiers before the debug output
1901  buff = __kmp_str_format(
1902  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1903  traits_t< UT >::spec, traits_t< UT >::spec );
1904  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1905  __kmp_str_free( &buff );
1906  }
1907  #endif
1908  } // if
1909  } // if
1910  } // case
1911  break;
1912 
1913  case kmp_sch_dynamic_chunked:
1914  {
1915  T chunk = pr->u.p.parm1;
1916 
1917  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1918  gtid ) );
1919 
1920  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1921  trip = pr->u.p.tc - 1;
1922 
1923  if ( (status = (init <= trip)) == 0 ) {
1924  *p_lb = 0;
1925  *p_ub = 0;
1926  if ( p_st != NULL ) *p_st = 0;
1927  } else {
1928  start = pr->u.p.lb;
1929  limit = chunk + init - 1;
1930  incr = pr->u.p.st;
1931 
1932  if ( (last = (limit >= trip)) != 0 )
1933  limit = trip;
1934 
1935  if ( p_st != NULL ) *p_st = incr;
1936 
1937  if ( incr == 1 ) {
1938  *p_lb = start + init;
1939  *p_ub = start + limit;
1940  } else {
1941  *p_lb = start + init * incr;
1942  *p_ub = start + limit * incr;
1943  }
1944 
1945  if ( pr->ordered ) {
1946  pr->u.p.ordered_lower = init;
1947  pr->u.p.ordered_upper = limit;
1948  #ifdef KMP_DEBUG
1949  {
1950  const char * buff;
1951  // create format specifiers before the debug output
1952  buff = __kmp_str_format(
1953  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1954  traits_t< UT >::spec, traits_t< UT >::spec );
1955  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1956  __kmp_str_free( &buff );
1957  }
1958  #endif
1959  } // if
1960  } // if
1961  } // case
1962  break;
1963 
1964  case kmp_sch_guided_iterative_chunked:
1965  {
1966  T chunkspec = pr->u.p.parm1;
1967  KD_TRACE(100,
1968  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1969  trip = pr->u.p.tc;
1970  // Start atomic part of calculations
1971  while(1) {
1972  ST remaining; // signed, because can be < 0
1973  init = sh->u.s.iteration; // shared value
1974  remaining = trip - init;
1975  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1976  // nothing to do, don't try atomic op
1977  status = 0;
1978  break;
1979  }
1980  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1981  // use dynamic-style schedule
1982  // atomically increment iterations, get old value
1983  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1984  remaining = trip - init;
1985  if (remaining <= 0) {
1986  status = 0; // all iterations taken by other threads
1987  } else {
1988  // got some iterations to work on
1989  status = 1;
1990  if ( (T)remaining > chunkspec ) {
1991  limit = init + chunkspec - 1;
1992  } else {
1993  last = 1; // the last chunk
1994  limit = init + remaining - 1;
1995  } // if
1996  } // if
1997  break;
1998  } // if
1999  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
2000  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
2001  // CAS was successful, chunk obtained
2002  status = 1;
2003  --limit;
2004  break;
2005  } // if
2006  } // while
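 // Illustrative note (not part of the runtime): with the default guided parameters
 // noted above (K = 2, so parm3 is roughly 0.5/nproc), each successful CAS claims about
 // remaining/(2*nproc) iterations, so chunk sizes decay geometrically. For example,
 // trip = 1000 and nproc = 4 yields chunks of roughly 125, 109, 95, ... until remaining
 // falls below parm2 = K*nproc*(chunk+1), after which the schedule falls back to plain
 // dynamic chunks of size chunkspec.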
2007  if ( status != 0 ) {
2008  start = pr->u.p.lb;
2009  incr = pr->u.p.st;
2010  if ( p_st != NULL )
2011  *p_st = incr;
2012  *p_lb = start + init * incr;
2013  *p_ub = start + limit * incr;
2014  if ( pr->ordered ) {
2015  pr->u.p.ordered_lower = init;
2016  pr->u.p.ordered_upper = limit;
2017  #ifdef KMP_DEBUG
2018  {
2019  const char * buff;
2020  // create format specifiers before the debug output
2021  buff = __kmp_str_format(
2022  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2023  traits_t< UT >::spec, traits_t< UT >::spec );
2024  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2025  __kmp_str_free( &buff );
2026  }
2027  #endif
2028  } // if
2029  } else {
2030  *p_lb = 0;
2031  *p_ub = 0;
2032  if ( p_st != NULL )
2033  *p_st = 0;
2034  } // if
2035  } // case
2036  break;
2037 
2038  case kmp_sch_guided_analytical_chunked:
2039  {
2040  T chunkspec = pr->u.p.parm1;
2041  UT chunkIdx;
2042  #if KMP_OS_WINDOWS && KMP_ARCH_X86
2043  /* for storing original FPCW value for Windows* OS on
2044  IA-32 architecture 8-byte version */
2045  unsigned int oldFpcw;
2046  unsigned int fpcwSet = 0;
2047  #endif
2048  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
2049  gtid ) );
2050 
2051  trip = pr->u.p.tc;
2052 
2053  KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2054  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip);
2055 
2056  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
2057  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
2058  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
2059  --trip;
2060  /* use dynamic-style scheduling */
2061  init = chunkIdx * chunkspec + pr->u.p.count;
2062  /* need to verify init > 0 in case of overflow in the above calculation */
2063  if ( (status = (init > 0 && init <= trip)) != 0 ) {
2064  limit = init + chunkspec -1;
2065 
2066  if ( (last = (limit >= trip)) != 0 )
2067  limit = trip;
2068  }
2069  break;
2070  } else {
2071  /* use exponential-style scheduling */
2072  /* The following check works around the lack of long double precision on Windows* OS,
2073  which could otherwise cause init != 0 for chunkIdx == 0.
2074  */
2075  #if KMP_OS_WINDOWS && KMP_ARCH_X86
2076  /* If we haven't already done so, save original
2077  FPCW and set precision to 64-bit, as Windows* OS
2078  on IA-32 architecture defaults to 53-bit */
2079  if ( !fpcwSet ) {
2080  oldFpcw = _control87(0,0);
2081  _control87(_PC_64,_MCW_PC);
2082  fpcwSet = 0x30000;
2083  }
2084  #endif
2085  if ( chunkIdx ) {
2086  init = __kmp_dispatch_guided_remaining< T >(
2087  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2088  KMP_DEBUG_ASSERT(init);
2089  init = trip - init;
2090  } else
2091  init = 0;
2092  limit = trip - __kmp_dispatch_guided_remaining< T >(
2093  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2094  KMP_ASSERT(init <= limit);
2095  if ( init < limit ) {
2096  KMP_DEBUG_ASSERT(limit <= trip);
2097  --limit;
2098  status = 1;
2099  break;
2100  } // if
2101  } // if
2102  } // while (1)
2103  #if KMP_OS_WINDOWS && KMP_ARCH_X86
2104  /* restore FPCW if necessary
2105  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2106  */
2107  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2108  _control87(oldFpcw,_MCW_PC);
2109  #endif
2110  if ( status != 0 ) {
2111  start = pr->u.p.lb;
2112  incr = pr->u.p.st;
2113  if ( p_st != NULL )
2114  *p_st = incr;
2115  *p_lb = start + init * incr;
2116  *p_ub = start + limit * incr;
2117  if ( pr->ordered ) {
2118  pr->u.p.ordered_lower = init;
2119  pr->u.p.ordered_upper = limit;
2120  #ifdef KMP_DEBUG
2121  {
2122  const char * buff;
2123  // create format specifiers before the debug output
2124  buff = __kmp_str_format(
2125  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2126  traits_t< UT >::spec, traits_t< UT >::spec );
2127  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2128  __kmp_str_free( &buff );
2129  }
2130  #endif
2131  }
2132  } else {
2133  *p_lb = 0;
2134  *p_ub = 0;
2135  if ( p_st != NULL )
2136  *p_st = 0;
2137  }
2138  } // case
2139  break;
2140 
2141  case kmp_sch_trapezoidal:
2142  {
2143  UT index;
2144  T parm2 = pr->u.p.parm2;
2145  T parm3 = pr->u.p.parm3;
2146  T parm4 = pr->u.p.parm4;
2147  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2148  gtid ) );
2149 
2150  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2151 
2152  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2153  trip = pr->u.p.tc - 1;
2154 
2155  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2156  *p_lb = 0;
2157  *p_ub = 0;
2158  if ( p_st != NULL ) *p_st = 0;
2159  } else {
2160  start = pr->u.p.lb;
2161  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2162  incr = pr->u.p.st;
2163 
2164  if ( (last = (limit >= trip)) != 0 )
2165  limit = trip;
2166 
2167  if ( p_st != NULL ) *p_st = incr;
2168 
2169  if ( incr == 1 ) {
2170  *p_lb = start + init;
2171  *p_ub = start + limit;
2172  } else {
2173  *p_lb = start + init * incr;
2174  *p_ub = start + limit * incr;
2175  }
2176 
2177  if ( pr->ordered ) {
2178  pr->u.p.ordered_lower = init;
2179  pr->u.p.ordered_upper = limit;
2180  #ifdef KMP_DEBUG
2181  {
2182  const char * buff;
2183  // create format specifiers before the debug output
2184  buff = __kmp_str_format(
2185  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2186  traits_t< UT >::spec, traits_t< UT >::spec );
2187  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2188  __kmp_str_free( &buff );
2189  }
2190  #endif
2191  } // if
2192  } // if
2193  } // case
2194  break;
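 // Illustrative note (not part of the runtime): for the trapezoidal schedule chunk
 // sizes decrease linearly, chunk i covering parm2 - i*parm4 iterations, so the prefix
 // sum init = i*(2*parm2 - (i-1)*parm4)/2 computed above is simply where chunk i
 // starts. For example, parm2 = 10 (first chunk) and parm4 = 2 (decrement) give chunks
 // of 10, 8, 6, ... iterations starting at offsets 0, 10, 18, ...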
2195  default:
2196  {
2197  status = 0; // to avoid complaints on uninitialized variable use
2198  __kmp_msg(
2199  kmp_ms_fatal, // Severity
2200  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2201  KMP_HNT( GetNewerLibrary ), // Hint
2202  __kmp_msg_null // Variadic argument list terminator
2203  );
2204  }
2205  break;
2206  } // switch
2207  } // if tc == 0;
2208 
2209  if ( status == 0 ) {
2210  UT num_done;
2211 
2212  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2213  #ifdef KMP_DEBUG
2214  {
2215  const char * buff;
2216  // create format specifiers before the debug output
2217  buff = __kmp_str_format(
2218  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2219  traits_t< UT >::spec );
2220  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2221  __kmp_str_free( &buff );
2222  }
2223  #endif
2224 
2225  if ( (ST)num_done == th->th.th_team_nproc - 1 ) {
2226  #if ( KMP_STATIC_STEAL_ENABLED )
2227  if( pr->schedule == kmp_sch_static_steal && traits_t< T >::type_size > 4 ) {
2228  int i;
2229  kmp_info_t **other_threads = team->t.t_threads;
2230  // loop complete, safe to destroy locks used for stealing
2231  for( i = 0; i < th->th.th_team_nproc; ++i ) {
2232  kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2233  KMP_ASSERT(lck != NULL);
2234  __kmp_destroy_lock( lck );
2235  __kmp_free( lck );
2236  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2237  }
2238  }
2239  #endif
2240  /* NOTE: release this buffer to be reused */
2241 
2242  KMP_MB(); /* Flush all pending memory write invalidates. */
2243 
2244  sh->u.s.num_done = 0;
2245  sh->u.s.iteration = 0;
2246 
2247  /* TODO replace with general release procedure? */
2248  if ( pr->ordered ) {
2249  sh->u.s.ordered_iteration = 0;
2250  }
2251 
2252  KMP_MB(); /* Flush all pending memory write invalidates. */
2253 
2254  sh -> buffer_index += __kmp_dispatch_num_buffers;
2255  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2256  gtid, sh->buffer_index) );
2257 
2258  KMP_MB(); /* Flush all pending memory write invalidates. */
2259 
2260  } // if
2261  if ( __kmp_env_consistency_check ) {
2262  if ( pr->pushed_ws != ct_none ) {
2263  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2264  }
2265  }
2266 
2267  th -> th.th_dispatch -> th_deo_fcn = NULL;
2268  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2269  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2270  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2271  } // if (status == 0)
2272 #if KMP_OS_WINDOWS
2273  else if ( last ) {
2274  pr->u.p.last_upper = pr->u.p.ub;
2275  }
2276 #endif /* KMP_OS_WINDOWS */
2277  if ( p_last != NULL && status != 0 )
2278  *p_last = last;
2279  } // if
2280 
2281  #ifdef KMP_DEBUG
2282  {
2283  const char * buff;
2284  // create format specifiers before the debug output
2285  buff = __kmp_str_format(
2286  "__kmp_dispatch_next: T#%%d normal case: " \
2287  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2288  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2289  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2290  __kmp_str_free( &buff );
2291  }
2292  #endif
2293 #if INCLUDE_SSC_MARKS
2294  SSC_MARK_DISPATCH_NEXT();
2295 #endif
2296  OMPT_LOOP_END;
2297  return status;
2298 }
2299 
2300 template< typename T >
2301 static void
2302 __kmp_dist_get_bounds(
2303  ident_t *loc,
2304  kmp_int32 gtid,
2305  kmp_int32 *plastiter,
2306  T *plower,
2307  T *pupper,
2308  typename traits_t< T >::signed_t incr
2309 ) {
2310  typedef typename traits_t< T >::unsigned_t UT;
2311  typedef typename traits_t< T >::signed_t ST;
2312  register kmp_uint32 team_id;
2313  register kmp_uint32 nteams;
2314  register UT trip_count;
2315  register kmp_team_t *team;
2316  kmp_info_t * th;
2317 
2318  KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2319  KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2320  #ifdef KMP_DEBUG
2321  {
2322  const char * buff;
2323  // create format specifiers before the debug output
2324  buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2325  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2326  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2327  traits_t< T >::spec );
2328  KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2329  __kmp_str_free( &buff );
2330  }
2331  #endif
2332 
2333  if( __kmp_env_consistency_check ) {
2334  if( incr == 0 ) {
2335  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2336  }
2337  if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2338  // The loop is illegal.
2339  // Some zero-trip loops are maintained by the compiler, e.g.:
2340  // for(i=10;i<0;++i) // lower >= upper - run-time check
2341  // for(i=0;i>10;--i) // lower <= upper - run-time check
2342  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2343  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2344  // The compiler does not check the following illegal loops:
2345  // for(i=0;i<10;i+=incr) // where incr<0
2346  // for(i=10;i>0;i-=incr) // where incr<0
2347  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2348  }
2349  }
2350  th = __kmp_threads[gtid];
2351  team = th->th.th_team;
2352  #if OMP_40_ENABLED
2353  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2354  nteams = th->th.th_teams_size.nteams;
2355  #endif
2356  team_id = team->t.t_master_tid;
2357  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2358 
2359  // compute global trip count
2360  if( incr == 1 ) {
2361  trip_count = *pupper - *plower + 1;
2362  } else if(incr == -1) {
2363  trip_count = *plower - *pupper + 1;
2364  } else if ( incr > 0 ) {
2365  // upper - lower can exceed the range of the signed type
2366  trip_count = (UT)(*pupper - *plower) / incr + 1;
2367  } else {
2368  trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
2369  }
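 // Illustrative example: a loop over 0..10 with incr = 3 gives
 // trip_count = (10 - 0) / 3 + 1 = 4 (iterations 0, 3, 6, 9).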
2370 
2371  if( trip_count <= nteams ) {
2372  KMP_DEBUG_ASSERT(
2373  __kmp_static == kmp_sch_static_greedy || \
2374  __kmp_static == kmp_sch_static_balanced
2375  ); // Unknown static scheduling type.
2376  // only some teams get a single iteration, others get nothing
2377  if( team_id < trip_count ) {
2378  *pupper = *plower = *plower + team_id * incr;
2379  } else {
2380  *plower = *pupper + incr; // zero-trip loop
2381  }
2382  if( plastiter != NULL )
2383  *plastiter = ( team_id == trip_count - 1 );
2384  } else {
2385  if( __kmp_static == kmp_sch_static_balanced ) {
2386  register UT chunk = trip_count / nteams;
2387  register UT extras = trip_count % nteams;
2388  *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2389  *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2390  if( plastiter != NULL )
2391  *plastiter = ( team_id == nteams - 1 );
2392  } else {
2393  register T chunk_inc_count =
2394  ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2395  register T upper = *pupper;
2396  KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2397  // Unknown static scheduling type.
2398  *plower += team_id * chunk_inc_count;
2399  *pupper = *plower + chunk_inc_count - incr;
2400  // Check/correct bounds if needed
2401  if( incr > 0 ) {
2402  if( *pupper < *plower )
2403  *pupper = i_maxmin< T >::mx;
2404  if( plastiter != NULL )
2405  *plastiter = *plower <= upper && *pupper > upper - incr;
2406  if( *pupper > upper )
2407  *pupper = upper; // tracker C73258
2408  } else {
2409  if( *pupper > *plower )
2410  *pupper = i_maxmin< T >::mn;
2411  if( plastiter != NULL )
2412  *plastiter = *plower >= upper && *pupper < upper - incr;
2413  if( *pupper < upper )
2414  *pupper = upper; // tracker C73258
2415  }
2416  }
2417  }
2418 }
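// Illustrative note (not part of the runtime): a worked example of the balanced split
// above. With *plower = 0, *pupper = 9, incr = 1 and nteams = 4, the trip count is 10,
// so chunk = 2 and extras = 2; team 0 gets [0,2], team 1 gets [3,5], team 2 gets [6,7],
// team 3 gets [8,9], and only team 3 reports *plastiter = 1.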
2419 
2420 //-----------------------------------------------------------------------------------------
2421 // Dispatch routines
2422 // Transfer call to template< type T >
2423 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2424 // T lb, T ub, ST st, ST chunk )
2425 extern "C" {
2426 
2442 void
2443 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2444  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2445 {
2446  KMP_DEBUG_ASSERT( __kmp_init_serial );
2447  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2448 }
2452 void
2453 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2454  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2455 {
2456  KMP_DEBUG_ASSERT( __kmp_init_serial );
2457  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2458 }
2459 
2463 void
2464 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2465  kmp_int64 lb, kmp_int64 ub,
2466  kmp_int64 st, kmp_int64 chunk )
2467 {
2468  KMP_DEBUG_ASSERT( __kmp_init_serial );
2469  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2470 }
2471 
2475 void
2476 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2477  kmp_uint64 lb, kmp_uint64 ub,
2478  kmp_int64 st, kmp_int64 chunk )
2479 {
2480  KMP_DEBUG_ASSERT( __kmp_init_serial );
2481  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2482 }
2483 
2493 void
2494 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2495  kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2496 {
2497  KMP_DEBUG_ASSERT( __kmp_init_serial );
2498  __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2499  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2500 }
2501 
2502 void
2503 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2504  kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2505 {
2506  KMP_DEBUG_ASSERT( __kmp_init_serial );
2507  __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2508  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2509 }
2510 
2511 void
2512 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2513  kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2514 {
2515  KMP_DEBUG_ASSERT( __kmp_init_serial );
2516  __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2517  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2518 }
2519 
2520 void
2521 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2522  kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2523 {
2524  KMP_DEBUG_ASSERT( __kmp_init_serial );
2525  __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2526  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2527 }
2528 
2541 int
2542 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2543  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2544 {
2545  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2546 }
2547 
2551 int
2552 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2553  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2554 {
2555  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2556 }
2557 
2561 int
2562 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2563  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2564 {
2565  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2566 }
2567 
2571 int
2572 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2573  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2574 {
2575  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2576 }
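/*
 * Minimal sketch (hypothetical loc, gtid, bounds and body; not part of the library) of
 * how compiler-generated code typically drives the entry points above for a
 * schedule(dynamic,4) loop over 0..999 with stride 1:
 *
 *   kmp_int32 lb, ub, st, last;
 *   __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, 999, 1, 4 );
 *   while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
 *       for ( kmp_int32 i = lb; i <= ub; i += st )
 *           body( i );                     // hypothetical loop body
 *   }
 *
 * The __kmpc_dispatch_fini_* entry points below forward to __kmp_dispatch_finish and
 * are used with ordered loops.
 */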
2577 
2584 void
2585 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2586 {
2587  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2588 }
2589 
2593 void
2594 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2595 {
2596  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2597 }
2598 
2602 void
2603 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2604 {
2605  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2606 }
2607 
2611 void
2612 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2613 {
2614  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2615 }
2618 //-----------------------------------------------------------------------------------------
2619 // Non-template routines from kmp_dispatch.cpp used in other sources
2620 
2621 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2622  return value == checker;
2623 }
2624 
2625 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2626  return value != checker;
2627 }
2628 
2629 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2630  return value < checker;
2631 }
2632 
2633 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2634  return value >= checker;
2635 }
2636 
2637 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2638  return value <= checker;
2639 }
2640 
2641 kmp_uint32
2642 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2643  kmp_uint32 checker,
2644  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2645  , void * obj // Higher-level synchronization object, or NULL.
2646  )
2647 {
2648  // note: we may not belong to a team at this point
2649  register volatile kmp_uint32 * spin = spinner;
2650  register kmp_uint32 check = checker;
2651  register kmp_uint32 spins;
2652  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2653  register kmp_uint32 r;
2654 
2655  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2656  KMP_INIT_YIELD( spins );
2657  // main wait spin loop
2658  while(!f(r = TCR_4(*spin), check)) {
2659  KMP_FSYNC_SPIN_PREPARE( obj );
2660  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2661  It causes problems with infinite recursion because of exit lock */
2662  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2663  __kmp_abort_thread(); */
2664 
2665  /* if we have waited a bit, or are oversubscribed, yield */
2666  /* pause is in the following code */
2667  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2668  KMP_YIELD_SPIN( spins );
2669  }
2670  KMP_FSYNC_SPIN_ACQUIRED( obj );
2671  return r;
2672 }
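/*
 * Illustrative sketch (not part of the runtime): __kmp_wait_yield_4 spins until
 * pred( *spinner, checker ) becomes true, yielding when the wait gets long or the
 * machine is oversubscribed, and returns the last observed value. A typical call,
 * with a hypothetical flag that another thread sets to 1:
 *
 *   volatile kmp_uint32 flag = 0;
 *   __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );   // returns once flag == 1
 */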
2673 
2674 void
2675 __kmp_wait_yield_4_ptr(void *spinner,
2676  kmp_uint32 checker,
2677  kmp_uint32 (*pred)( void *, kmp_uint32 ),
2678  void *obj // Higher-level synchronization object, or NULL.
2679  )
2680 {
2681  // note: we may not belong to a team at this point
2682  register void *spin = spinner;
2683  register kmp_uint32 check = checker;
2684  register kmp_uint32 spins;
2685  register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;
2686 
2687  KMP_FSYNC_SPIN_INIT( obj, spin );
2688  KMP_INIT_YIELD( spins );
2689  // main wait spin loop
2690  while ( !f( spin, check ) ) {
2691  KMP_FSYNC_SPIN_PREPARE( obj );
2692  /* if we have waited a bit, or are oversubscribed, yield */
2693  /* pause is in the following code */
2694  KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
2695  KMP_YIELD_SPIN( spins );
2696  }
2697  KMP_FSYNC_SPIN_ACQUIRED( obj );
2698 }
2699 
2700 } // extern "C"
2701 
2702 #ifdef KMP_GOMP_COMPAT
2703 
2704 void
2705 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2706  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2707  kmp_int32 chunk, int push_ws )
2708 {
2709  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2710  push_ws );
2711 }
2712 
2713 void
2714 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2715  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2716  kmp_int32 chunk, int push_ws )
2717 {
2718  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2719  push_ws );
2720 }
2721 
2722 void
2723 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2724  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2725  kmp_int64 chunk, int push_ws )
2726 {
2727  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2728  push_ws );
2729 }
2730 
2731 void
2732 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2733  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2734  kmp_int64 chunk, int push_ws )
2735 {
2736  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2737  push_ws );
2738 }
2739 
2740 void
2741 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2742 {
2743  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2744 }
2745 
2746 void
2747 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2748 {
2749  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2750 }
2751 
2752 void
2753 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2754 {
2755  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2756 }
2757 
2758 void
2759 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2760 {
2761  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2762 }
2763 
2764 #endif /* KMP_GOMP_COMPAT */
2765 
2766 /* ------------------------------------------------------------------------ */
2767 /* ------------------------------------------------------------------------ */
2768 