/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
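
// Usage sketch (an illustrative reading of the declarations above, not a
// definitive contract): __kmp_dispatch_init_algorithm() is expected to be
// called once per loop to fill in the private descriptor, after which
// __kmp_dispatch_next_algorithm() is called repeatedly, writing each chunk's
// bounds/stride into *p_lb, *p_ub and *p_st and returning nonzero while
// chunks remain. A hypothetical caller (ITT argument elided) might look like:
//
//   __kmp_dispatch_init_algorithm<kmp_int32>(loc, gtid, pr, schedule, lb, ub,
//                                            st, /*chunk=*/1, nproc, unit_id);
//   while (__kmp_dispatch_next_algorithm<kmp_int32>(gtid, pr, sh, &last, &lb,
//                                                   &ub, &st, nproc, unit_id))
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);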

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the structure size is not computed
  // correctly by our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  kmp_uint32 ordered_bumped;
  // dummy entries to retain the structure size
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

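// Informational note: the templated payload above is reached through the
// union, e.g. pr->u.p.ordered_lower in __kmp_dispatch_deo() below; per the
// comment above, the p64 member appears to exist only so the union (and thus
// the structure) keeps the size of the 64-bit layout for any T.
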
// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need the union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
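
// Design note: the primary templates here (test_then_add, test_then_inc_acq,
// test_then_inc, compare_and_swap) are declared but deliberately left
// undefined, so instantiating one with a type that has no specialization
// below fails at link time rather than silently performing a non-atomic
// update. For example, a hypothetical test_then_add<short>(&x, 1) would not
// link.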

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
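
// Usage sketch (illustrative only; `claim_slot` is a hypothetical helper, not
// part of this file): a typical retry loop built on compare_and_swap<>,
// claiming the current value of a shared counter and advancing it atomically.
//
//   static kmp_int32 claim_slot(volatile kmp_int32 *counter) {
//     kmp_int32 old_v;
//     do {
//       old_v = *counter; // snapshot the current value
//     } while (!compare_and_swap<kmp_int32>(counter, old_v, old_v + 1));
//     return old_v; // the slot this thread successfully claimed
//   }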

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
   Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
   Arguments:
     UT is an unsigned 4- or 8-byte type
     spinner - memory location whose value is checked
     checker - value which *spinner is compared against (>, <, ==, etc.)
     pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
     obj - higher-level synchronization object to report to ittnotify. It
       is used to report locks consistently. For example, if a lock is
       acquired immediately, its address is reported to ittnotify via
       KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
       immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
       should report the same address, not the address of the low-level
       spinner.
#endif // USE_ITT_BUILD
   TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
                           kmp_uint32 (*pred)(UT, UT)
                               USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of the exit
       lock. */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield; the pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
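
// Example of use (taken from __kmp_dispatch_deo() below): wait until the
// team's shared ordered_iteration reaches this thread's lower bound:
//
//   __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
//                        __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));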

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x raised to the power of y, where y must be a
   non-negative integer. */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
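
// Worked example (informational): this is binary exponentiation. For
// y = 5 (binary 101), the loop multiplies s by x on the bits that are set:
//   bit 0 set: s = x,       x becomes x^2
//   bit 1 off: s unchanged, x becomes x^4
//   bit 2 set: s = x * x^4 = x^5
// so __kmp_pow(0.5L, 5) returns 0.03125 in O(log y) multiplications.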

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx).

   __forceinline seems to be broken here: if we __forceinline this function,
   the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we work around that in the caller
     code by manipulating the FPCW for Windows* OS on IA-32 architecture. The
     lack of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
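
// Worked example (informational, values chosen for illustration): with
// tc = 1000 iterations, base = 0.875 (i.e., 1 - 1/(n * nproc) for n = 2,
// nproc = 4) and idx = 1, the result is ceil(1000 * 0.875^1) = 875; for
// idx = 2 it is ceil(1000 * 0.765625) = 766. Rounding up (r + 1 when x is
// not integral) errs toward leaving iterations unassigned rather than
// over-assigning them.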

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * (chunk + 1)  // point of switching to dynamic
//   p3 = 1 / (n * nproc)          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter. With n = 1 the first chunk is the same as for the static schedule,
// i.e. trip_count / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
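
// Quick arithmetic check of the parameters above (informational): with the
// default n = 2, nproc = 8 threads and chunk = 1, the switch point is
// p2 = 2 * 8 * (1 + 1) = 32, so once roughly 32 iterations remain the
// schedule falls back to plain dynamic; until then each grab takes about
// p3 = 1 / (2 * 8) = 0.0625 of the remaining iterations.
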
#endif // KMP_DISPATCH_H