LLVM OpenMP* Runtime Library
kmp_dispatch.h
/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // when they are in the same cache line (not measured, though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
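
// A minimal sketch (illustrative only, not part of the runtime) of what the
// 32-byte alignment above buys: with a hypothetical 64-bit T, a block that is
// both 32 bytes in size and 32-byte aligned can start only at offset 0 or 32
// within a 64-byte cache line, so parm1-4 never straddle a line boundary.
#if 0
#include <cstdint>
struct alignas(32) ParmsSketch { // stands in for the KMP_ALIGN(32) struct
  int64_t parm1, parm2, parm3, parm4; // 4 * 8 = 32 bytes total
};
static_assert(sizeof(ParmsSketch) == 32, "parm block is exactly 32 bytes");
static_assert(alignof(ParmsSketch) == 32, "parm block is 32-byte aligned");
#endif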

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  kmp_uint32 ordered_bumped;
  // to retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
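
// The primary templates above are intentionally declared but never defined,
// so instantiating them with an unsupported type fails at link time rather
// than silently compiling. A rough std::atomic analogue of test_then_add
// (a sketch only; the runtime uses its own KMP_TEST_THEN_* atomics, not
// <atomic>):
#if 0
#include <atomic>
template <typename T>
static T test_then_add_sketch(std::atomic<T> *p, T d) {
  // fetch_add returns the value observed *before* the addition, which is
  // the same convention the KMP_TEST_THEN_ADD32/64 wrappers follow.
  return p->fetch_add(d, std::memory_order_acq_rel);
}
#endif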

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
   Spin wait loop that first does pause, then yield.
   Waits until pred returns non-zero when called with *spinner and checker.
   Does NOT put threads to sleep.
   Arguments:
     UT is an unsigned 4- or 8-byte type
     spinner - memory location whose value is checked
     checker - value against which *spinner is compared (>, <, ==, etc.)
     pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
     obj -- higher-level synchronization object to report to ittnotify. It
         is used to report locks consistently. For example, if a lock is
         acquired immediately, its address is reported to ittnotify via
         KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
         immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
         should report the same address, not the address of the low-level
         spinner.
#endif // USE_ITT_BUILD
   TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
                           kmp_uint32 (*pred)(UT, UT)
                               USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
       __kmp_abort_thread(); */

    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. The pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
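
// Usage sketch (hypothetical caller, illustrative only): spin until *flag
// reaches at least 5, reusing the __kmp_ge predicate defined above. Under
// USE_ITT_BUILD the trailing argument would carry the ittnotify sync object.
#if 0
static void wait_until_five(volatile kmp_uint32 *flag) {
  kmp_uint32 seen = __kmp_wait_yield<kmp_uint32>(
      flag, 5, __kmp_ge<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
  (void)seen; // first observed value satisfying seen >= 5
}
#endif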

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}
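
// Taken together, the two routines above implement a ticket protocol for
// ORDERED sections: __kmp_dispatch_deo() spins until the shared counter
// reaches this chunk's ordered lower bound, and __kmp_dispatch_dxo() bumps
// the counter to release the next chunk in sequence. A distilled sketch
// (illustrative only; the real code adds consistency checks, debug traces,
// and memory fences):
#if 0
template <typename UT>
static void ordered_enter_sketch(volatile UT *shared_iter, UT my_lower) {
  while (!(*shared_iter >= my_lower)) { /* pause / yield */ } // deo
}
template <typename UT>
static void ordered_exit_sketch(volatile UT *shared_iter) {
  ++*shared_iter; // dxo; the runtime uses an atomic test_then_inc here
}
#endif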

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
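
// Worked trace of the square-and-multiply loop above: for y = 13 = 0b1101,
// s accumulates x^1, x^4, and x^8 (one factor per set bit), i.e. x^13, in
// O(log y) multiplications rather than 13:
//   y = 13 (bit 1): s = x              x -> x^2
//   y = 6  (bit 0):                    x -> x^4
//   y = 3  (bit 1): s = x * x^4        x -> x^8
//   y = 1  (bit 1): s = x * x^4 * x^8  = x^13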

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx).
   __forceinline seems to be broken here: if we __forceinline this function,
   the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails).
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we work around that in the caller
     code by manipulating the FPCW for Windows* OS on IA-32 architecture. The
     lack of precision is not expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
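
// Worked example (hypothetical numbers): with tc = 1000 iterations, a decay
// base of 0.75, and idx = 3 chunks already assigned, the result is
// ceil(1000 * 0.75^3) = ceil(421.875) = 422. The (x == r) test implements the
// ceiling: exact values are returned as-is; fractional ones round up via r + 1.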

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for the static schedule, i.e.
// trip / nproc. A worked example follows.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
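
// Worked example (hypothetical trip count): trip = 1024, nproc = 4, n = 2
// gives p3 = 1 / (2 * 4) = 1/8, so successive grabs take remaining/8
// iterations: 128, 112, 98, ... With n = 1 the first grab would be
// 1024 / 4 = 256, i.e. the static-schedule chunk, as noted above.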
#endif // KMP_DISPATCH_H