1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_wait_release.h"
16 #include "kmp_itt.h"
17 #include "kmp_os.h"
18 #include "kmp_stats.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
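// The ngo_* macros broadcast a cache line of ICVs (with the b_go flag
// piggybacked in its last 8 bytes) in one shot. On KMP_MIC they use 512-bit
// non-globally-ordered streaming stores, and ngo_sync() issues a locked add
// to the stack as a memory fence so those stores become visible before any
// later flag accesses; on other targets they fall back to plain
// copy_icvs()/KMP_MEMCPY() and no extra fence is needed.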
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
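// Each algorithm below is a gather/release pair. In the gather phase, worker
// threads signal their b_arrived flags up toward the master (tid 0),
// optionally combining per-thread reduce_data through the reduce() callback
// along the way. In the release phase, b_go flags bumped by
// KMP_BARRIER_STATE_BUMP are propagated back down from the master, optionally
// pushing ICVs to the workers when propagate_icvs is set.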
46 
47 // Linear Barrier
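// In the linear scheme each worker simply releases its own b_arrived flag
// with the master registered as the waiter, while the master waits on every
// worker's flag (and performs the reduction) one thread at a time. The gather
// is O(nproc) serialized work on the master but only a single store per
// worker, which favors small teams.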
48 static void __kmp_linear_barrier_gather(
49  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52  kmp_team_t *team = this_thr->th.th_team;
53  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54  kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(
57  20,
58  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66  __itt_get_timestamp();
67  }
68 #endif
69  // We now perform a linear reduction to signal that all of the threads have
70  // arrived.
71  if (!KMP_MASTER_TID(tid)) {
72  KA_TRACE(20,
 73  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
74  "arrived(%p): %llu => %llu\n",
75  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78  // Mark arrival to master thread
79  /* After performing this write, a worker thread may not assume that the team
80  is valid any more - it could be deallocated by the master thread at any
81  time. */
82  ANNOTATE_BARRIER_BEGIN(this_thr);
83  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84  flag.release();
85  } else {
86  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87  int nproc = this_thr->th.th_team_nproc;
88  int i;
89  // Don't have to worry about sleep bit here or atomic since team setting
90  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 
92  // Collect all the worker team member threads.
93  for (i = 1; i < nproc; ++i) {
94 #if KMP_CACHE_MANAGE
95  // Prefetch next thread's arrived count
96  if (i + 1 < nproc)
97  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100  "arrived(%p) == %llu\n",
101  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
102  team->t.t_id, i,
103  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
104 
105  // Wait for worker thread to arrive
106  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
107  new_state);
108  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109  ANNOTATE_BARRIER_END(other_threads[i]);
110 #if USE_ITT_BUILD && USE_ITT_NOTIFY
111  // Barrier imbalance - write min of the thread time and the other thread
112  // time to the thread.
113  if (__kmp_forkjoin_frames_mode == 2) {
114  this_thr->th.th_bar_min_time = KMP_MIN(
115  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
116  }
117 #endif
118  if (reduce) {
119  KA_TRACE(100,
120  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
121  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
122  team->t.t_id, i));
123  ANNOTATE_REDUCE_AFTER(reduce);
124  (*reduce)(this_thr->th.th_local.reduce_data,
125  other_threads[i]->th.th_local.reduce_data);
126  ANNOTATE_REDUCE_BEFORE(reduce);
127  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
128  }
129  }
130  // Don't have to worry about sleep bit here or atomic since team setting
131  team_bar->b_arrived = new_state;
132  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
133  "arrived(%p) = %llu\n",
134  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
135  new_state));
136  }
137  KA_TRACE(
138  20,
139  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
140  gtid, team->t.t_id, tid, bt));
141 }
142 
143 static void __kmp_linear_barrier_release(
144  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
145  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
146  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
147  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
148  kmp_team_t *team;
149 
150  if (KMP_MASTER_TID(tid)) {
151  unsigned int i;
152  kmp_uint32 nproc = this_thr->th.th_team_nproc;
153  kmp_info_t **other_threads;
154 
155  team = __kmp_threads[gtid]->th.th_team;
156  KMP_DEBUG_ASSERT(team != NULL);
157  other_threads = team->t.t_threads;
158 
159  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
160  "barrier type %d\n",
161  gtid, team->t.t_id, tid, bt));
162 
163  if (nproc > 1) {
164 #if KMP_BARRIER_ICV_PUSH
165  {
166  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
167  if (propagate_icvs) {
168  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
169  for (i = 1; i < nproc; ++i) {
170  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
171  team, i, FALSE);
172  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
173  &team->t.t_implicit_task_taskdata[0].td_icvs);
174  }
175  ngo_sync();
176  }
177  }
178 #endif // KMP_BARRIER_ICV_PUSH
179 
180  // Now, release all of the worker threads
181  for (i = 1; i < nproc; ++i) {
182 #if KMP_CACHE_MANAGE
183  // Prefetch next thread's go flag
184  if (i + 1 < nproc)
185  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
186 #endif /* KMP_CACHE_MANAGE */
187  KA_TRACE(
188  20,
189  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
190  "go(%p): %u => %u\n",
191  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
192  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
193  other_threads[i]->th.th_bar[bt].bb.b_go,
194  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
195  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
196  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
197  other_threads[i]);
198  flag.release();
199  }
200  }
201  } else { // Wait for the MASTER thread to release us
202  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
203  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
204  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
205  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
206  ANNOTATE_BARRIER_END(this_thr);
207 #if USE_ITT_BUILD && USE_ITT_NOTIFY
208  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
209  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
210  // disabled)
211  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
212  // Cancel wait on previous parallel region...
213  __kmp_itt_task_starting(itt_sync_obj);
214 
215  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
216  return;
217 
218  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
219  if (itt_sync_obj != NULL)
220  // Call prepare as early as possible for "new" barrier
221  __kmp_itt_task_finished(itt_sync_obj);
222  } else
223 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
224  // Early exit for reaping threads releasing forkjoin barrier
225  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
226  return;
227 // The worker thread may now assume that the team is valid.
228 #ifdef KMP_DEBUG
229  tid = __kmp_tid_from_gtid(gtid);
230  team = __kmp_threads[gtid]->th.th_team;
231 #endif
232  KMP_DEBUG_ASSERT(team != NULL);
233  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
234  KA_TRACE(20,
235  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
236  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
237  KMP_MB(); // Flush all pending memory write invalidates.
238  }
239  KA_TRACE(
240  20,
241  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
242  gtid, team->t.t_id, tid, bt));
243 }
244 
245 // Tree barrier
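// Threads form a tree with branching factor branch_factor = 1 << branch_bits:
// the children of tid are tid * branch_factor + 1 .. tid * branch_factor +
// branch_factor, and the parent of a worker is (tid - 1) >> branch_bits. For
// example, with branch_bits == 2, tid 0 gathers tids 1..4 while tid 1 gathers
// tids 5..8. Each inner node waits for its children's b_arrived flags
// (reducing as it goes) and then signals its own parent; the release walks
// the same tree top-down via the b_go flags.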
246 static void
247 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
248  int tid, void (*reduce)(void *, void *)
249  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
250  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
251  kmp_team_t *team = this_thr->th.th_team;
252  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
253  kmp_info_t **other_threads = team->t.t_threads;
254  kmp_uint32 nproc = this_thr->th.th_team_nproc;
255  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
256  kmp_uint32 branch_factor = 1 << branch_bits;
257  kmp_uint32 child;
258  kmp_uint32 child_tid;
259  kmp_uint64 new_state;
260 
261  KA_TRACE(
262  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
263  gtid, team->t.t_id, tid, bt));
264  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
265 
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267  // Barrier imbalance - save arrive time to the thread
268  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
269  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
270  __itt_get_timestamp();
271  }
272 #endif
273  // Perform tree gather to wait until all threads have arrived; reduce any
274  // required data as we go
275  child_tid = (tid << branch_bits) + 1;
276  if (child_tid < nproc) {
277  // Parent threads wait for all their children to arrive
278  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
279  child = 1;
280  do {
281  kmp_info_t *child_thr = other_threads[child_tid];
282  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
283 #if KMP_CACHE_MANAGE
284  // Prefetch next thread's arrived count
285  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
286  KMP_CACHE_PREFETCH(
287  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
288 #endif /* KMP_CACHE_MANAGE */
289  KA_TRACE(20,
290  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
291  "arrived(%p) == %llu\n",
292  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
293  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
294  // Wait for child to arrive
295  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
296  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
297  ANNOTATE_BARRIER_END(child_thr);
298 #if USE_ITT_BUILD && USE_ITT_NOTIFY
299  // Barrier imbalance - write min of the thread time and a child time to
300  // the thread.
301  if (__kmp_forkjoin_frames_mode == 2) {
302  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
303  child_thr->th.th_bar_min_time);
304  }
305 #endif
306  if (reduce) {
307  KA_TRACE(100,
308  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
309  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
310  team->t.t_id, child_tid));
311  ANNOTATE_REDUCE_AFTER(reduce);
312  (*reduce)(this_thr->th.th_local.reduce_data,
313  child_thr->th.th_local.reduce_data);
314  ANNOTATE_REDUCE_BEFORE(reduce);
315  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
316  }
317  child++;
318  child_tid++;
319  } while (child <= branch_factor && child_tid < nproc);
320  }
321 
322  if (!KMP_MASTER_TID(tid)) { // Worker threads
323  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
324 
325  KA_TRACE(20,
326  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
327  "arrived(%p): %llu => %llu\n",
328  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
329  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
330  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
331 
332  // Mark arrival to parent thread
333  /* After performing this write, a worker thread may not assume that the team
334  is valid any more - it could be deallocated by the master thread at any
335  time. */
336  ANNOTATE_BARRIER_BEGIN(this_thr);
337  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
338  flag.release();
339  } else {
340  // Need to update the team arrived pointer if we are the master thread
341  if (nproc > 1) // New value was already computed above
342  team->t.t_bar[bt].b_arrived = new_state;
343  else
344  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
345  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
346  "arrived(%p) = %llu\n",
347  gtid, team->t.t_id, tid, team->t.t_id,
348  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
349  }
350  KA_TRACE(20,
351  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
352  gtid, team->t.t_id, tid, bt));
353 }
354 
355 static void __kmp_tree_barrier_release(
356  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
357  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
358  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
359  kmp_team_t *team;
360  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
361  kmp_uint32 nproc;
362  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
363  kmp_uint32 branch_factor = 1 << branch_bits;
364  kmp_uint32 child;
365  kmp_uint32 child_tid;
366 
367  // Perform a tree release for all of the threads that have been gathered
 368  if (!KMP_MASTER_TID(tid)) {
 369  // Handle fork barrier workers who aren't part of a team yet
370  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
371  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
372  // Wait for parent thread to release us
373  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
374  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
375  ANNOTATE_BARRIER_END(this_thr);
376 #if USE_ITT_BUILD && USE_ITT_NOTIFY
377  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
378  // In fork barrier where we could not get the object reliably (or
379  // ITTNOTIFY is disabled)
380  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
381  // Cancel wait on previous parallel region...
382  __kmp_itt_task_starting(itt_sync_obj);
383 
384  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
385  return;
386 
387  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
388  if (itt_sync_obj != NULL)
389  // Call prepare as early as possible for "new" barrier
390  __kmp_itt_task_finished(itt_sync_obj);
391  } else
392 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
393  // Early exit for reaping threads releasing forkjoin barrier
394  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
395  return;
396 
397  // The worker thread may now assume that the team is valid.
398  team = __kmp_threads[gtid]->th.th_team;
399  KMP_DEBUG_ASSERT(team != NULL);
400  tid = __kmp_tid_from_gtid(gtid);
401 
402  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
403  KA_TRACE(20,
404  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
405  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
406  KMP_MB(); // Flush all pending memory write invalidates.
407  } else {
408  team = __kmp_threads[gtid]->th.th_team;
409  KMP_DEBUG_ASSERT(team != NULL);
410  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
411  "barrier type %d\n",
412  gtid, team->t.t_id, tid, bt));
413  }
414  nproc = this_thr->th.th_team_nproc;
415  child_tid = (tid << branch_bits) + 1;
416 
417  if (child_tid < nproc) {
418  kmp_info_t **other_threads = team->t.t_threads;
419  child = 1;
420  // Parent threads release all their children
421  do {
422  kmp_info_t *child_thr = other_threads[child_tid];
423  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
424 #if KMP_CACHE_MANAGE
425  // Prefetch next thread's go count
426  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
427  KMP_CACHE_PREFETCH(
428  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
429 #endif /* KMP_CACHE_MANAGE */
430 
431 #if KMP_BARRIER_ICV_PUSH
432  {
433  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
434  if (propagate_icvs) {
435  __kmp_init_implicit_task(team->t.t_ident,
436  team->t.t_threads[child_tid], team,
437  child_tid, FALSE);
438  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
439  &team->t.t_implicit_task_taskdata[0].td_icvs);
440  }
441  }
442 #endif // KMP_BARRIER_ICV_PUSH
443  KA_TRACE(20,
 444  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
445  "go(%p): %u => %u\n",
446  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
447  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
448  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
449  // Release child from barrier
450  ANNOTATE_BARRIER_BEGIN(child_thr);
451  kmp_flag_64 flag(&child_bar->b_go, child_thr);
452  flag.release();
453  child++;
454  child_tid++;
455  } while (child <= branch_factor && child_tid < nproc);
456  }
457  KA_TRACE(
458  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
459  gtid, team->t.t_id, tid, bt));
460 }
461 
462 // Hyper Barrier
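// The hypercube-embedded tree works in rounds of stride 1 << level. In the
// round for "level", a thread whose digit ((tid >> level) & (branch_factor -
// 1)) is nonzero signals the parent obtained by clearing the low
// level + branch_bits bits of its tid and drops out; a thread whose digit is
// zero gathers up to branch_factor - 1 children at tid + k * (1 << level).
// With branch_bits == 1 and 8 threads this gives the classic pairing
// 0<-1, 2<-3, 4<-5, 6<-7, then 0<-2, 4<-6, then 0<-4.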
463 static void
464 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
465  int tid, void (*reduce)(void *, void *)
466  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
467  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
468  kmp_team_t *team = this_thr->th.th_team;
469  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
470  kmp_info_t **other_threads = team->t.t_threads;
471  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
472  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
473  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
474  kmp_uint32 branch_factor = 1 << branch_bits;
475  kmp_uint32 offset;
476  kmp_uint32 level;
477 
478  KA_TRACE(
479  20,
480  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
481  gtid, team->t.t_id, tid, bt));
482  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
483 
484 #if USE_ITT_BUILD && USE_ITT_NOTIFY
485  // Barrier imbalance - save arrive time to the thread
486  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
487  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
488  __itt_get_timestamp();
489  }
490 #endif
491  /* Perform a hypercube-embedded tree gather to wait until all of the threads
492  have arrived, and reduce any required data as we go. */
493  kmp_flag_64 p_flag(&thr_bar->b_arrived);
494  for (level = 0, offset = 1; offset < num_threads;
495  level += branch_bits, offset <<= branch_bits) {
496  kmp_uint32 child;
497  kmp_uint32 child_tid;
498 
499  if (((tid >> level) & (branch_factor - 1)) != 0) {
500  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
501 
502  KA_TRACE(20,
503  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
504  "arrived(%p): %llu => %llu\n",
505  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
506  team->t.t_id, parent_tid, &thr_bar->b_arrived,
507  thr_bar->b_arrived,
508  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
509  // Mark arrival to parent thread
510  /* After performing this write (in the last iteration of the enclosing for
511  loop), a worker thread may not assume that the team is valid any more
512  - it could be deallocated by the master thread at any time. */
513  ANNOTATE_BARRIER_BEGIN(this_thr);
514  p_flag.set_waiter(other_threads[parent_tid]);
515  p_flag.release();
516  break;
517  }
518 
519  // Parent threads wait for children to arrive
520  if (new_state == KMP_BARRIER_UNUSED_STATE)
521  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
522  for (child = 1, child_tid = tid + (1 << level);
523  child < branch_factor && child_tid < num_threads;
524  child++, child_tid += (1 << level)) {
525  kmp_info_t *child_thr = other_threads[child_tid];
526  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
527 #if KMP_CACHE_MANAGE
528  kmp_uint32 next_child_tid = child_tid + (1 << level);
529  // Prefetch next thread's arrived count
530  if (child + 1 < branch_factor && next_child_tid < num_threads)
531  KMP_CACHE_PREFETCH(
532  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
533 #endif /* KMP_CACHE_MANAGE */
534  KA_TRACE(20,
535  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
536  "arrived(%p) == %llu\n",
537  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
538  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
539  // Wait for child to arrive
540  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
541  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
542  ANNOTATE_BARRIER_END(child_thr);
543 #if USE_ITT_BUILD && USE_ITT_NOTIFY
544  // Barrier imbalance - write min of the thread time and a child time to
545  // the thread.
546  if (__kmp_forkjoin_frames_mode == 2) {
547  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
548  child_thr->th.th_bar_min_time);
549  }
550 #endif
551  if (reduce) {
552  KA_TRACE(100,
553  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
554  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
555  team->t.t_id, child_tid));
556  ANNOTATE_REDUCE_AFTER(reduce);
557  (*reduce)(this_thr->th.th_local.reduce_data,
558  child_thr->th.th_local.reduce_data);
559  ANNOTATE_REDUCE_BEFORE(reduce);
560  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
561  }
562  }
563  }
564 
565  if (KMP_MASTER_TID(tid)) {
566  // Need to update the team arrived pointer if we are the master thread
567  if (new_state == KMP_BARRIER_UNUSED_STATE)
568  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
569  else
570  team->t.t_bar[bt].b_arrived = new_state;
571  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
572  "arrived(%p) = %llu\n",
573  gtid, team->t.t_id, tid, team->t.t_id,
574  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
575  }
576  KA_TRACE(
577  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
578  gtid, team->t.t_id, tid, bt));
579 }
580 
581 // The reverse versions seem to beat the forward versions overall
582 #define KMP_REVERSE_HYPER_BAR
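// With KMP_REVERSE_HYPER_BAR defined (the default), the release below first
// counts up to the level at which this thread was released (the top, for the
// master) and then walks the levels back down, visiting children from the
// highest tid to the lowest, so wake-ups occur in roughly the reverse of the
// gather order.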
583 static void __kmp_hyper_barrier_release(
584  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
585  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
586  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
587  kmp_team_t *team;
588  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
589  kmp_info_t **other_threads;
590  kmp_uint32 num_threads;
591  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
592  kmp_uint32 branch_factor = 1 << branch_bits;
593  kmp_uint32 child;
594  kmp_uint32 child_tid;
595  kmp_uint32 offset;
596  kmp_uint32 level;
597 
598  /* Perform a hypercube-embedded tree release for all of the threads that have
599  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
600  are released in the reverse order of the corresponding gather, otherwise
601  threads are released in the same order. */
602  if (KMP_MASTER_TID(tid)) { // master
603  team = __kmp_threads[gtid]->th.th_team;
604  KMP_DEBUG_ASSERT(team != NULL);
605  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
606  "barrier type %d\n",
607  gtid, team->t.t_id, tid, bt));
608 #if KMP_BARRIER_ICV_PUSH
609  if (propagate_icvs) { // master already has ICVs in final destination; copy
610  copy_icvs(&thr_bar->th_fixed_icvs,
611  &team->t.t_implicit_task_taskdata[tid].td_icvs);
612  }
613 #endif
614  } else { // Handle fork barrier workers who aren't part of a team yet
615  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
616  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
617  // Wait for parent thread to release us
618  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
619  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
620  ANNOTATE_BARRIER_END(this_thr);
621 #if USE_ITT_BUILD && USE_ITT_NOTIFY
622  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
623  // In fork barrier where we could not get the object reliably
624  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
625  // Cancel wait on previous parallel region...
626  __kmp_itt_task_starting(itt_sync_obj);
627 
628  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
629  return;
630 
631  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
632  if (itt_sync_obj != NULL)
633  // Call prepare as early as possible for "new" barrier
634  __kmp_itt_task_finished(itt_sync_obj);
635  } else
636 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
637  // Early exit for reaping threads releasing forkjoin barrier
638  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
639  return;
640 
641  // The worker thread may now assume that the team is valid.
642  team = __kmp_threads[gtid]->th.th_team;
643  KMP_DEBUG_ASSERT(team != NULL);
644  tid = __kmp_tid_from_gtid(gtid);
645 
646  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
647  KA_TRACE(20,
648  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
649  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
650  KMP_MB(); // Flush all pending memory write invalidates.
651  }
652  num_threads = this_thr->th.th_team_nproc;
653  other_threads = team->t.t_threads;
654 
655 #ifdef KMP_REVERSE_HYPER_BAR
656  // Count up to correct level for parent
657  for (level = 0, offset = 1;
658  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
659  level += branch_bits, offset <<= branch_bits)
660  ;
661 
662  // Now go down from there
663  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
664  level -= branch_bits, offset >>= branch_bits)
665 #else
666  // Go down the tree, level by level
667  for (level = 0, offset = 1; offset < num_threads;
668  level += branch_bits, offset <<= branch_bits)
669 #endif // KMP_REVERSE_HYPER_BAR
670  {
671 #ifdef KMP_REVERSE_HYPER_BAR
672  /* Now go in reverse order through the children, highest to lowest.
673  Initial setting of child is conservative here. */
674  child = num_threads >> ((level == 0) ? level : level - 1);
675  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
676  child_tid = tid + (child << level);
677  child >= 1; child--, child_tid -= (1 << level))
678 #else
679  if (((tid >> level) & (branch_factor - 1)) != 0)
680  // No need to go lower than this, since this is the level parent would be
681  // notified
682  break;
683  // Iterate through children on this level of the tree
684  for (child = 1, child_tid = tid + (1 << level);
685  child < branch_factor && child_tid < num_threads;
686  child++, child_tid += (1 << level))
687 #endif // KMP_REVERSE_HYPER_BAR
688  {
689  if (child_tid >= num_threads)
690  continue; // Child doesn't exist so keep going
691  else {
692  kmp_info_t *child_thr = other_threads[child_tid];
693  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
694 #if KMP_CACHE_MANAGE
695  kmp_uint32 next_child_tid = child_tid - (1 << level);
696 // Prefetch next thread's go count
697 #ifdef KMP_REVERSE_HYPER_BAR
698  if (child - 1 >= 1 && next_child_tid < num_threads)
699 #else
700  if (child + 1 < branch_factor && next_child_tid < num_threads)
701 #endif // KMP_REVERSE_HYPER_BAR
702  KMP_CACHE_PREFETCH(
703  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
704 #endif /* KMP_CACHE_MANAGE */
705 
706 #if KMP_BARRIER_ICV_PUSH
707  if (propagate_icvs) // push my fixed ICVs to my child
708  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
709 #endif // KMP_BARRIER_ICV_PUSH
710 
711  KA_TRACE(
712  20,
 713  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
714  "go(%p): %u => %u\n",
715  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
716  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
717  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
718  // Release child from barrier
719  ANNOTATE_BARRIER_BEGIN(child_thr);
720  kmp_flag_64 flag(&child_bar->b_go, child_thr);
721  flag.release();
722  }
723  }
724  }
725 #if KMP_BARRIER_ICV_PUSH
726  if (propagate_icvs &&
727  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
728  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
729  FALSE);
730  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
731  &thr_bar->th_fixed_icvs);
732  }
733 #endif
734  KA_TRACE(
735  20,
736  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
737  gtid, team->t.t_id, tid, bt));
738 }
739 
740 // Hierarchical Barrier
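// The hierarchical barrier arranges the team according to the machine
// hierarchy obtained from __kmp_get_hierarchy(); each thread caches its
// level, parent tid and leaf children in its kmp_bstate_t. When blocktime is
// infinite and the barrier is not nested (use_oncore_barrier), leaf threads
// check in by writing a single byte of their parent's 64-bit b_arrived word
// and are woken through bytes of the parent's b_go word, so a parent and all
// of its leaf children synchronize through one flag word instead of one flag
// per thread.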
741 
742 // Initialize thread barrier data
743 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
744  Performs the minimum amount of initialization required based on how the team
745  has changed. Returns true if leaf children will require both on-core and
746  traditional wake-up mechanisms. For example, if the team size increases,
747  threads already in the team will respond to on-core wakeup on their parent
 748  thread, but threads newly added to the team will only be listening on
 749  their local b_go. */
750 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
751  kmp_bstate_t *thr_bar,
752  kmp_uint32 nproc, int gtid,
753  int tid, kmp_team_t *team) {
754  // Checks to determine if (re-)initialization is needed
755  bool uninitialized = thr_bar->team == NULL;
756  bool team_changed = team != thr_bar->team;
757  bool team_sz_changed = nproc != thr_bar->nproc;
758  bool tid_changed = tid != thr_bar->old_tid;
759  bool retval = false;
760 
761  if (uninitialized || team_sz_changed) {
762  __kmp_get_hierarchy(nproc, thr_bar);
763  }
764 
765  if (uninitialized || team_sz_changed || tid_changed) {
766  thr_bar->my_level = thr_bar->depth - 1; // default for master
767  thr_bar->parent_tid = -1; // default for master
 768  if (!KMP_MASTER_TID(tid)) {
 769  // if not master, find parent thread in hierarchy
770  kmp_uint32 d = 0;
771  while (d < thr_bar->depth) { // find parent based on level of thread in
772  // hierarchy, and note level
773  kmp_uint32 rem;
774  if (d == thr_bar->depth - 2) { // reached level right below the master
775  thr_bar->parent_tid = 0;
776  thr_bar->my_level = d;
777  break;
778  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
779  0) { // TODO: can we make this op faster?
780  // thread is not a subtree root at next level, so this is max
781  thr_bar->parent_tid = tid - rem;
782  thr_bar->my_level = d;
783  break;
784  }
785  ++d;
786  }
787  }
788  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
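  // offset selects which byte of the parent's 64-bit flag words this thread
  // owns: the first child after the parent (tid == parent_tid + 1) gets byte
  // 7, the next gets byte 6, and so on. Leaves use it both to check in on the
  // parent's b_arrived and to be released through the parent's b_go (see
  // kmp_flag_oncore).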
789  thr_bar->old_tid = tid;
790  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
791  thr_bar->team = team;
792  thr_bar->parent_bar =
793  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
794  }
795  if (uninitialized || team_changed || tid_changed) {
796  thr_bar->team = team;
797  thr_bar->parent_bar =
798  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
799  retval = true;
800  }
801  if (uninitialized || team_sz_changed || tid_changed) {
802  thr_bar->nproc = nproc;
803  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
804  if (thr_bar->my_level == 0)
805  thr_bar->leaf_kids = 0;
806  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
807  thr_bar->leaf_kids = nproc - tid - 1;
808  thr_bar->leaf_state = 0;
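  // leaf_state gets one byte set per leaf child, laid out to match the byte
  // each child writes into this thread's b_arrived (7, 6, ...), so the gather
  // phase can wait for all leaves with a single 64-bit comparison against
  // b_arrived | leaf_state.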
809  for (int i = 0; i < thr_bar->leaf_kids; ++i)
810  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
811  }
812  return retval;
813 }
814 
815 static void __kmp_hierarchical_barrier_gather(
816  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
817  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
818  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
819  kmp_team_t *team = this_thr->th.th_team;
820  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
821  kmp_uint32 nproc = this_thr->th.th_team_nproc;
822  kmp_info_t **other_threads = team->t.t_threads;
823  kmp_uint64 new_state;
824 
825  int level = team->t.t_level;
826 #if OMP_40_ENABLED
827  if (other_threads[0]
828  ->th.th_teams_microtask) // are we inside the teams construct?
829  if (this_thr->th.th_teams_size.nteams > 1)
830  ++level; // level was not increased in teams construct for team_of_masters
831 #endif
832  if (level == 1)
833  thr_bar->use_oncore_barrier = 1;
834  else
835  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
836 
837  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
838  "barrier type %d\n",
839  gtid, team->t.t_id, tid, bt));
840  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
841 
842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
843  // Barrier imbalance - save arrive time to the thread
844  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
845  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
846  }
847 #endif
848 
849  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
850  team);
851 
852  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
853  kmp_int32 child_tid;
854  new_state =
855  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
856  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
857  thr_bar->use_oncore_barrier) {
858  if (thr_bar->leaf_kids) {
859  // First, wait for leaf children to check-in on my b_arrived flag
860  kmp_uint64 leaf_state =
861  KMP_MASTER_TID(tid)
862  ? thr_bar->b_arrived | thr_bar->leaf_state
863  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
864  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
865  "for leaf kids\n",
866  gtid, team->t.t_id, tid));
867  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
868  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
869  if (reduce) {
870  ANNOTATE_REDUCE_AFTER(reduce);
871  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
872  ++child_tid) {
873  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
874  "T#%d(%d:%d)\n",
875  gtid, team->t.t_id, tid,
876  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
877  child_tid));
878  ANNOTATE_BARRIER_END(other_threads[child_tid]);
879  (*reduce)(this_thr->th.th_local.reduce_data,
880  other_threads[child_tid]->th.th_local.reduce_data);
881  }
882  ANNOTATE_REDUCE_BEFORE(reduce);
883  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
884  }
885  // clear leaf_state bits
886  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
887  }
888  // Next, wait for higher level children on each child's b_arrived flag
889  for (kmp_uint32 d = 1; d < thr_bar->my_level;
890  ++d) { // gather lowest level threads first, but skip 0
891  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
892  skip = thr_bar->skip_per_level[d];
893  if (last > nproc)
894  last = nproc;
895  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
896  kmp_info_t *child_thr = other_threads[child_tid];
897  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
898  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
899  "T#%d(%d:%d) "
900  "arrived(%p) == %llu\n",
901  gtid, team->t.t_id, tid,
902  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
903  child_tid, &child_bar->b_arrived, new_state));
904  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
905  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
906  ANNOTATE_BARRIER_END(child_thr);
907  if (reduce) {
908  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
909  "T#%d(%d:%d)\n",
910  gtid, team->t.t_id, tid,
911  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
912  child_tid));
913  ANNOTATE_REDUCE_AFTER(reduce);
914  (*reduce)(this_thr->th.th_local.reduce_data,
915  child_thr->th.th_local.reduce_data);
916  ANNOTATE_REDUCE_BEFORE(reduce);
917  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
918  }
919  }
920  }
921  } else { // Blocktime is not infinite
922  for (kmp_uint32 d = 0; d < thr_bar->my_level;
923  ++d) { // Gather lowest level threads first
924  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
925  skip = thr_bar->skip_per_level[d];
926  if (last > nproc)
927  last = nproc;
928  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
929  kmp_info_t *child_thr = other_threads[child_tid];
930  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
931  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
932  "T#%d(%d:%d) "
933  "arrived(%p) == %llu\n",
934  gtid, team->t.t_id, tid,
935  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
936  child_tid, &child_bar->b_arrived, new_state));
937  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
938  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
939  ANNOTATE_BARRIER_END(child_thr);
940  if (reduce) {
941  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
942  "T#%d(%d:%d)\n",
943  gtid, team->t.t_id, tid,
944  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
945  child_tid));
946  ANNOTATE_REDUCE_AFTER(reduce);
947  (*reduce)(this_thr->th.th_local.reduce_data,
948  child_thr->th.th_local.reduce_data);
949  ANNOTATE_REDUCE_BEFORE(reduce);
950  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
951  }
952  }
953  }
954  }
955  }
956  // All subordinates are gathered; now release parent if not master thread
957 
958  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
959  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
960  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
961  gtid, team->t.t_id, tid,
962  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
963  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
964  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
965  /* Mark arrival to parent: After performing this write, a worker thread may
966  not assume that the team is valid any more - it could be deallocated by
967  the master thread at any time. */
968  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
969  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
970  // flag; release it
971  ANNOTATE_BARRIER_BEGIN(this_thr);
972  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
973  flag.release();
974  } else {
975  // Leaf does special release on "offset" bits of parent's b_arrived flag
976  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
977  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
978  flag.set_waiter(other_threads[thr_bar->parent_tid]);
979  flag.release();
980  }
981  } else { // Master thread needs to update the team's b_arrived value
982  team->t.t_bar[bt].b_arrived = new_state;
983  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
984  "arrived(%p) = %llu\n",
985  gtid, team->t.t_id, tid, team->t.t_id,
986  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
987  }
988  // Is the team access below unsafe or just technically invalid?
989  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
990  "barrier type %d\n",
991  gtid, team->t.t_id, tid, bt));
992 }
993 
994 static void __kmp_hierarchical_barrier_release(
995  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
996  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
997  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
998  kmp_team_t *team;
999  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1000  kmp_uint32 nproc;
1001  bool team_change = false; // indicates on-core barrier shouldn't be used
1002 
1003  if (KMP_MASTER_TID(tid)) {
1004  team = __kmp_threads[gtid]->th.th_team;
1005  KMP_DEBUG_ASSERT(team != NULL);
1006  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1007  "entered barrier type %d\n",
1008  gtid, team->t.t_id, tid, bt));
1009  } else { // Worker threads
1010  // Wait for parent thread to release me
1011  if (!thr_bar->use_oncore_barrier ||
1012  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1013  thr_bar->team == NULL) {
1014  // Use traditional method of waiting on my own b_go flag
1015  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1016  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1017  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1018  ANNOTATE_BARRIER_END(this_thr);
1019  TCW_8(thr_bar->b_go,
1020  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1021  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1022  // infinite, not nested
1023  // Wait on my "offset" bits on parent's b_go flag
1024  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1025  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1026  thr_bar->offset, bt,
1027  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1028  flag.wait(this_thr, TRUE);
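  // The oncore wait can finish for two reasons: the parent released this
  // thread's byte in its b_go word, or the waiter was switched over to
  // spinning on its own b_go (wait_flag == KMP_BARRIER_SWITCHING). The
  // branches below reset whichever flag was actually used.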
1029  if (thr_bar->wait_flag ==
1030  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1031  TCW_8(thr_bar->b_go,
1032  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1033  } else { // Reset my bits on parent's b_go flag
1034  (RCAST(volatile char *,
1035  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1036  }
1037  }
1038  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1039  // Early exit for reaping threads releasing forkjoin barrier
1040  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1041  return;
1042  // The worker thread may now assume that the team is valid.
1043  team = __kmp_threads[gtid]->th.th_team;
1044  KMP_DEBUG_ASSERT(team != NULL);
1045  tid = __kmp_tid_from_gtid(gtid);
1046 
1047  KA_TRACE(
1048  20,
1049  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1050  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1051  KMP_MB(); // Flush all pending memory write invalidates.
1052  }
1053 
1054  nproc = this_thr->th.th_team_nproc;
1055  int level = team->t.t_level;
1056 #if OMP_40_ENABLED
1057  if (team->t.t_threads[0]
1058  ->th.th_teams_microtask) { // are we inside the teams construct?
1059  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1060  this_thr->th.th_teams_level == level)
1061  ++level; // level was not increased in teams construct for team_of_workers
1062  if (this_thr->th.th_teams_size.nteams > 1)
1063  ++level; // level was not increased in teams construct for team_of_masters
1064  }
1065 #endif
1066  if (level == 1)
1067  thr_bar->use_oncore_barrier = 1;
1068  else
1069  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1070 
1071  // If the team size has increased, we still communicate with old leaves via
1072  // oncore barrier.
1073  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1074  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1075  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1076  tid, team);
1077  // But if the entire team changes, we won't use oncore barrier at all
1078  if (team_change)
1079  old_leaf_kids = 0;
1080 
1081 #if KMP_BARRIER_ICV_PUSH
1082  if (propagate_icvs) {
1083  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1084  FALSE);
 1085  if (KMP_MASTER_TID(tid)) {
 1086  // master already has copy in final destination; copy
1087  copy_icvs(&thr_bar->th_fixed_icvs,
1088  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1089  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1090  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1091  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1092  // leaves (on-core children) pull parent's fixed ICVs directly to local
1093  // ICV store
1094  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1095  &thr_bar->parent_bar->th_fixed_icvs);
1096  // non-leaves will get ICVs piggybacked with b_go via NGO store
1097  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1098  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1099  // access
1100  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1101  else // leaves copy parent's fixed ICVs directly to local ICV store
1102  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1103  &thr_bar->parent_bar->th_fixed_icvs);
1104  }
1105  }
1106 #endif // KMP_BARRIER_ICV_PUSH
1107 
1108  // Now, release my children
1109  if (thr_bar->my_level) { // not a leaf
1110  kmp_int32 child_tid;
1111  kmp_uint32 last;
1112  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1113  thr_bar->use_oncore_barrier) {
1114  if (KMP_MASTER_TID(tid)) { // do a flat release
1115  // Set local b_go to bump children via NGO store of the cache line
1116  // containing IVCs and b_go.
1117  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1118  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1119  // the cache line
1120  ngo_load(&thr_bar->th_fixed_icvs);
1121  // This loops over all the threads skipping only the leaf nodes in the
1122  // hierarchy
1123  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1124  child_tid += thr_bar->skip_per_level[1]) {
1125  kmp_bstate_t *child_bar =
1126  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1127  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1128  "releasing T#%d(%d:%d)"
1129  " go(%p): %u => %u\n",
1130  gtid, team->t.t_id, tid,
1131  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1132  child_tid, &child_bar->b_go, child_bar->b_go,
1133  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1134  // Use ngo store (if available) to both store ICVs and release child
1135  // via child's b_go
1136  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1137  }
1138  ngo_sync();
1139  }
1140  TCW_8(thr_bar->b_go,
1141  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1142  // Now, release leaf children
1143  if (thr_bar->leaf_kids) { // if there are any
1144  // We test team_change on the off-chance that the level 1 team changed.
1145  if (team_change ||
1146  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1147  if (old_leaf_kids) { // release old leaf kids
1148  thr_bar->b_go |= old_leaf_state;
1149  }
1150  // Release new leaf kids
1151  last = tid + thr_bar->skip_per_level[1];
1152  if (last > nproc)
1153  last = nproc;
1154  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1155  ++child_tid) { // skip_per_level[0]=1
1156  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1157  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1158  KA_TRACE(
1159  20,
1160  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1161  " T#%d(%d:%d) go(%p): %u => %u\n",
1162  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1163  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1164  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1165  // Release child using child's b_go flag
1166  ANNOTATE_BARRIER_BEGIN(child_thr);
1167  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1168  flag.release();
1169  }
1170  } else { // Release all children at once with leaf_state bits on my own
1171  // b_go flag
1172  thr_bar->b_go |= thr_bar->leaf_state;
1173  }
1174  }
1175  } else { // Blocktime is not infinite; do a simple hierarchical release
1176  for (int d = thr_bar->my_level - 1; d >= 0;
1177  --d) { // Release highest level threads first
1178  last = tid + thr_bar->skip_per_level[d + 1];
1179  kmp_uint32 skip = thr_bar->skip_per_level[d];
1180  if (last > nproc)
1181  last = nproc;
1182  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1183  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1184  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1185  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1186  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1187  gtid, team->t.t_id, tid,
1188  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1189  child_tid, &child_bar->b_go, child_bar->b_go,
1190  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1191  // Release child using child's b_go flag
1192  ANNOTATE_BARRIER_BEGIN(child_thr);
1193  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1194  flag.release();
1195  }
1196  }
1197  }
1198 #if KMP_BARRIER_ICV_PUSH
1199  if (propagate_icvs && !KMP_MASTER_TID(tid))
1200  // non-leaves copy ICVs from fixed ICVs to local dest
1201  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1202  &thr_bar->th_fixed_icvs);
1203 #endif // KMP_BARRIER_ICV_PUSH
1204  }
1205  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1206  "barrier type %d\n",
1207  gtid, team->t.t_id, tid, bt));
1208 }
1209 
1210 // End of Barrier Algorithms
1211 
1212 // Internal function to do a barrier.
1213 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1214  If reduce is non-NULL, perform the reduction as part of the barrier;
1215  otherwise, no reduction is done.
1216  Returns 0 if master thread, 1 if worker thread. */
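// For a split barrier the master returns right after the gather phase and the
// release half runs later from __kmp_end_split_barrier(); workers always fall
// through to the release phase to wait on their b_go flags.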
1217 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1218  size_t reduce_size, void *reduce_data,
1219  void (*reduce)(void *, void *)) {
1220  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1221  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1222  int tid = __kmp_tid_from_gtid(gtid);
1223  kmp_info_t *this_thr = __kmp_threads[gtid];
1224  kmp_team_t *team = this_thr->th.th_team;
1225  int status = 0;
1226  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1227 #if OMPT_SUPPORT
1228  ompt_data_t *my_task_data;
1229  ompt_data_t *my_parallel_data;
1230  void *return_address;
1231 #endif
1232 
1233  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1234  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1235 
1236  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1237 #if OMPT_SUPPORT
1238  if (ompt_enabled.enabled) {
1239 #if OMPT_OPTIONAL
1240  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1241  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1242  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1243  if (ompt_enabled.ompt_callback_sync_region) {
1244  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1245  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1246  my_task_data, return_address);
1247  }
1248  if (ompt_enabled.ompt_callback_sync_region_wait) {
1249  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1250  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1251  my_task_data, return_address);
1252  }
1253 #endif
1254  // It is OK to report the barrier state after the barrier begin callback.
1255  // According to the OMPT specification, a compliant implementation may
1256  // even delay reporting this state until the barrier begins to wait.
1257  this_thr->th.ompt_thread_info.state = omp_state_wait_barrier;
1258  }
1259 #endif
1260 
1261  if (!team->t.t_serialized) {
1262 #if USE_ITT_BUILD
1263  // This value will be used in itt notify events below.
1264  void *itt_sync_obj = NULL;
1265 #if USE_ITT_NOTIFY
1266  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1267  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1268 #endif
1269 #endif /* USE_ITT_BUILD */
1270  if (__kmp_tasking_mode == tskm_extra_barrier) {
1271  __kmp_tasking_barrier(team, this_thr, gtid);
1272  KA_TRACE(15,
1273  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1274  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1275  }
1276 
1277  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1278  access it when the team struct is not guaranteed to exist. */
1279  // See note about the corresponding code in __kmp_join_barrier() being
1280  // performance-critical.
1281  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1282 #if KMP_USE_MONITOR
1283  this_thr->th.th_team_bt_intervals =
1284  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1285  this_thr->th.th_team_bt_set =
1286  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1287 #else
1288  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1289 #endif
1290  }
1291 
1292 #if USE_ITT_BUILD
1293  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1294  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1295 #endif /* USE_ITT_BUILD */
1296 #if USE_DEBUGGER
 1297  // Let the debugger know: the thread arrived at the barrier and is waiting.
1298  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1299  team->t.t_bar[bt].b_master_arrived += 1;
1300  } else {
1301  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1302  } // if
1303 #endif /* USE_DEBUGGER */
1304  if (reduce != NULL) {
1305  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1306  this_thr->th.th_local.reduce_data = reduce_data;
1307  }
1308 
1309  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1310  __kmp_task_team_setup(
1311  this_thr, team,
1312  0); // use 0 to only setup the current team if nthreads > 1
1313 
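  // The gather and release patterns are selected per barrier type (plain,
  // fork/join, reduction) at startup, typically via the KMP_*_BARRIER_PATTERN
  // environment settings. Hyper and tree require nonzero branch bits
  // (asserted below); the linear algorithm is the fallback.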
1314  switch (__kmp_barrier_gather_pattern[bt]) {
1315  case bp_hyper_bar: {
1316  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1317  // to 0; use linear
1318  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1319  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1320  break;
1321  }
1322  case bp_hierarchical_bar: {
1323  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
1324  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1325  break;
1326  }
1327  case bp_tree_bar: {
1328  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1329  // to 0; use linear
1330  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1331  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1332  break;
1333  }
1334  default: {
1335  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1336  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1337  }
1338  }
1339 
1340  KMP_MB();
1341 
1342  if (KMP_MASTER_TID(tid)) {
1343  status = 0;
1344  if (__kmp_tasking_mode != tskm_immediate_exec) {
1345  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1346  }
1347 #if USE_DEBUGGER
 1348  // Let the debugger know: all threads have arrived and are starting to
 1349  // leave the barrier.
1350  team->t.t_bar[bt].b_team_arrived += 1;
1351 #endif
1352 
1353 #if OMP_40_ENABLED
1354  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1355  // Reset cancellation flag for worksharing constructs
1356  if (cancel_request == cancel_loop || cancel_request == cancel_sections) {
1357  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1358  }
1359 #endif
1360 #if USE_ITT_BUILD
1361  /* TODO: In case of split reduction barrier, master thread may send
1362  acquired event early, before the final summation into the shared
1363  variable is done (final summation can be a long operation for array
1364  reductions). */
1365  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1366  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1367 #endif /* USE_ITT_BUILD */
1368 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1369  // Barrier - report frame end (only if active_level == 1)
1370  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1371  __kmp_forkjoin_frames_mode &&
1372 #if OMP_40_ENABLED
1373  this_thr->th.th_teams_microtask == NULL &&
1374 #endif
1375  team->t.t_active_level == 1) {
1376  kmp_uint64 cur_time = __itt_get_timestamp();
1377  kmp_info_t **other_threads = team->t.t_threads;
1378  int nproc = this_thr->th.th_team_nproc;
1379  int i;
1380  switch (__kmp_forkjoin_frames_mode) {
1381  case 1:
1382  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1383  loc, nproc);
1384  this_thr->th.th_frame_time = cur_time;
1385  break;
1386  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1387  // be fixed)
1388  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1389  1, loc, nproc);
1390  break;
1391  case 3:
1392  if (__itt_metadata_add_ptr) {
1393  // Initialize with master's wait time
1394  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1395  // Set arrive time to zero to be able to check it in
1396  // __kmp_invoke_task(); the same is done inside the loop below
1397  this_thr->th.th_bar_arrive_time = 0;
1398  for (i = 1; i < nproc; ++i) {
1399  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1400  other_threads[i]->th.th_bar_arrive_time = 0;
1401  }
1402  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1403  cur_time, delta,
1404  (kmp_uint64)(reduce != NULL));
1405  }
1406  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1407  loc, nproc);
1408  this_thr->th.th_frame_time = cur_time;
1409  break;
1410  }
1411  }
1412 #endif /* USE_ITT_BUILD */
1413  } else {
1414  status = 1;
1415 #if USE_ITT_BUILD
1416  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1417  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1418 #endif /* USE_ITT_BUILD */
1419  }
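  // Workers (status == 1) always wait here to be released. For a split
  // barrier the master skips the release phase and returns status 0; the
  // caller is then expected to complete the release via
  // __kmp_end_split_barrier().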
1420  if (status == 1 || !is_split) {
1421  switch (__kmp_barrier_release_pattern[bt]) {
1422  case bp_hyper_bar: {
1423  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1424  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1425  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1426  break;
1427  }
1428  case bp_hierarchical_bar: {
1429  __kmp_hierarchical_barrier_release(
1430  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1431  break;
1432  }
1433  case bp_tree_bar: {
1434  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1435  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1436  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1437  break;
1438  }
1439  default: {
1440  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1441  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1442  }
1443  }
1444  if (__kmp_tasking_mode != tskm_immediate_exec) {
1445  __kmp_task_team_sync(this_thr, team);
1446  }
1447  }
1448 
1449 #if USE_ITT_BUILD
1450  /* GEH: TODO: Move this under if-condition above and also include in
1451  __kmp_end_split_barrier(). This will more accurately represent the actual
1452  release time of the threads for split barriers. */
1453  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1454  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1455 #endif /* USE_ITT_BUILD */
1456  } else { // Team is serialized.
1457  status = 0;
1458  if (__kmp_tasking_mode != tskm_immediate_exec) {
1459 #if OMP_45_ENABLED
1460  if (this_thr->th.th_task_team != NULL) {
1461  void *itt_sync_obj = NULL;
1462 #if USE_ITT_NOTIFY
1463  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1464  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1465  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1466  }
1467 #endif
1468 
1469  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1470  TRUE);
1471  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1472  __kmp_task_team_setup(this_thr, team, 0);
1473 
1474 #if USE_ITT_BUILD
1475  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1476  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1477 #endif /* USE_ITT_BUILD */
1478  }
1479 #else
1480  // The task team should be NULL for serialized code (tasks will be
1481  // executed immediately)
1482  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1483  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1484 #endif
1485  }
1486  }
1487  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1488  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1489  __kmp_tid_from_gtid(gtid), status));
1490 
1491 #if OMPT_SUPPORT
1492  if (ompt_enabled.enabled) {
1493 #if OMPT_OPTIONAL
1494  if (ompt_enabled.ompt_callback_sync_region_wait) {
1495  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1496  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1497  my_task_data, return_address);
1498  }
1499  if (ompt_enabled.ompt_callback_sync_region) {
1500  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1501  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1502  my_task_data, return_address);
1503  }
1504 #endif
1505  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1506  }
1507 #endif
1508  ANNOTATE_BARRIER_END(&team->t.t_bar);
1509 
1510  return status;
1511 }
1512 
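// Complete the release phase of a split barrier. Only the master does real
// work here: it runs the configured release pattern, which frees the workers
// still waiting in the release phase of __kmp_barrier(), and then re-syncs
// the task teams. A minimal sketch of the intended calling pattern
// (illustrative only; the names of the reduction arguments are placeholders):
//
//   int status = __kmp_barrier(bs_plain_barrier, gtid, TRUE /* is_split */,
//                              reduce_size, reduce_data, reduce_func);
//   if (status == 0) { // master of the split barrier
//     // ... complete the reduction into the shared variable ...
//     __kmp_end_split_barrier(bs_plain_barrier, gtid);
//   }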
1513 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1514  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1515  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1516  int tid = __kmp_tid_from_gtid(gtid);
1517  kmp_info_t *this_thr = __kmp_threads[gtid];
1518  kmp_team_t *team = this_thr->th.th_team;
1519 
1520  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1521  if (!team->t.t_serialized) {
1522  if (KMP_MASTER_GTID(gtid)) {
1523  switch (__kmp_barrier_release_pattern[bt]) {
1524  case bp_hyper_bar: {
1525  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1526  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1527  FALSE USE_ITT_BUILD_ARG(NULL));
1528  break;
1529  }
1530  case bp_hierarchical_bar: {
1531  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1532  FALSE USE_ITT_BUILD_ARG(NULL));
1533  break;
1534  }
1535  case bp_tree_bar: {
1536  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1537  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1538  FALSE USE_ITT_BUILD_ARG(NULL));
1539  break;
1540  }
1541  default: {
1542  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1543  FALSE USE_ITT_BUILD_ARG(NULL));
1544  }
1545  }
1546  if (__kmp_tasking_mode != tskm_immediate_exec) {
1547  __kmp_task_team_sync(this_thr, team);
1548  } // if
1549  }
1550  }
1551  ANNOTATE_BARRIER_END(&team->t.t_bar);
1552 }
1553 
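// The join barrier is executed by every thread of a team at the end of a
// parallel region. Only the gather phase runs here; the corresponding
// release happens in __kmp_fork_barrier(), where the workers wait to be
// handed the next parallel region.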
1554 void __kmp_join_barrier(int gtid) {
1555  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1556  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1557  kmp_info_t *this_thr = __kmp_threads[gtid];
1558  kmp_team_t *team;
1559  kmp_uint nproc;
1560  kmp_info_t *master_thread;
1561  int tid;
1562 #ifdef KMP_DEBUG
1563  int team_id;
1564 #endif /* KMP_DEBUG */
1565 #if USE_ITT_BUILD
1566  void *itt_sync_obj = NULL;
1567 #if USE_ITT_NOTIFY
1568  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine needlessly
1569  // Get object created at fork_barrier
1570  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1571 #endif
1572 #endif /* USE_ITT_BUILD */
1573  KMP_MB();
1574 
1575  // Get current info
1576  team = this_thr->th.th_team;
1577  nproc = this_thr->th.th_team_nproc;
1578  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1579  tid = __kmp_tid_from_gtid(gtid);
1580 #ifdef KMP_DEBUG
1581  team_id = team->t.t_id;
1582 #endif /* KMP_DEBUG */
1583  master_thread = this_thr->th.th_team_master;
1584 #ifdef KMP_DEBUG
1585  if (master_thread != team->t.t_threads[0]) {
1586  __kmp_print_structure();
1587  }
1588 #endif /* KMP_DEBUG */
1589  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1590  KMP_MB();
1591 
1592  // Verify state
1593  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1594  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1595  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1596  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1597  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1598  gtid, team_id, tid));
1599 
1600  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1601 #if OMPT_SUPPORT
1602  ompt_data_t *my_task_data;
1603  ompt_data_t *my_parallel_data;
1604  if (ompt_enabled.enabled) {
1605 #if OMPT_OPTIONAL
1606  void *codeptr = NULL;
1607  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1608  if (KMP_MASTER_TID(ds_tid) &&
1609  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1610  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1611  codeptr = team->t.ompt_team_info.master_return_address;
1612  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1613  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1614  if (ompt_enabled.ompt_callback_sync_region) {
1615  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1616  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1617  my_task_data, codeptr);
1618  }
1619  if (ompt_enabled.ompt_callback_sync_region_wait) {
1620  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1621  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1622  my_task_data, codeptr);
1623  }
1624  if (!KMP_MASTER_TID(ds_tid))
1625  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1626 #endif
1627  this_thr->th.ompt_thread_info.state = omp_state_wait_barrier_implicit;
1628  }
1629 #endif
1630 
1631  if (__kmp_tasking_mode == tskm_extra_barrier) {
1632  __kmp_tasking_barrier(team, this_thr, gtid);
1633  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1634  team_id, tid));
1635  }
1636 #ifdef KMP_DEBUG
1637  if (__kmp_tasking_mode != tskm_immediate_exec) {
1638  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1639  "%p, th_task_team = %p\n",
1640  __kmp_gtid_from_thread(this_thr), team_id,
1641  team->t.t_task_team[this_thr->th.th_task_state],
1642  this_thr->th.th_task_team));
1643  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1644  team->t.t_task_team[this_thr->th.th_task_state]);
1645  }
1646 #endif /* KMP_DEBUG */
1647 
1648  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1649  access it when the team struct is not guaranteed to exist. Doing these
1650  loads causes a cache miss that slows down EPCC parallel by 2x. As a
1651  workaround, we do not perform the copy if blocktime=infinite, since the
1652  values are not used by __kmp_wait_template() in that case. */
1653  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1654 #if KMP_USE_MONITOR
1655  this_thr->th.th_team_bt_intervals =
1656  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1657  this_thr->th.th_team_bt_set =
1658  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1659 #else
1660  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1661 #endif
1662  }
1663 
1664 #if USE_ITT_BUILD
1665  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1666  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1667 #endif /* USE_ITT_BUILD */
1668 
1669  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1670  case bp_hyper_bar: {
1671  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1672  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1673  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1674  break;
1675  }
1676  case bp_hierarchical_bar: {
1677  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1678  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1679  break;
1680  }
1681  case bp_tree_bar: {
1682  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1683  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1684  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1685  break;
1686  }
1687  default: {
1688  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1689  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1690  }
1691  }
1692 
1693  /* From this point on, the team data structure may be deallocated at any time
1694  by the master thread - it is unsafe to reference it in any of the worker
1695  threads. Any per-team data items that need to be referenced before the
1696  end of the barrier should be moved to the kmp_task_team_t structs. */
1697  if (KMP_MASTER_TID(tid)) {
1698  if (__kmp_tasking_mode != tskm_immediate_exec) {
1699  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1700  }
1701 #if KMP_STATS_ENABLED
1702  // Have the master thread flag the workers to indicate they are now waiting
1703  // for the next parallel region. Also wake them up so they switch their
1704  // timers to idle.
1705  for (int i = 0; i < team->t.t_nproc; ++i) {
1706  kmp_info_t *team_thread = team->t.t_threads[i];
1707  if (team_thread == this_thr)
1708  continue;
1709  team_thread->th.th_stats->setIdleFlag();
1710  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1711  team_thread->th.th_sleep_loc != NULL)
1712  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1713  team_thread->th.th_sleep_loc);
1714  }
1715 #endif
1716 #if USE_ITT_BUILD
1717  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1718  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1719 #endif /* USE_ITT_BUILD */
1720 
1721 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1722  // Join barrier - report frame end
1723  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1724  __kmp_forkjoin_frames_mode &&
1725 #if OMP_40_ENABLED
1726  this_thr->th.th_teams_microtask == NULL &&
1727 #endif
1728  team->t.t_active_level == 1) {
1729  kmp_uint64 cur_time = __itt_get_timestamp();
1730  ident_t *loc = team->t.t_ident;
1731  kmp_info_t **other_threads = team->t.t_threads;
1732  int nproc = this_thr->th.th_team_nproc;
1733  int i;
1734  switch (__kmp_forkjoin_frames_mode) {
1735  case 1:
1736  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1737  loc, nproc);
1738  break;
1739  case 2:
1740  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1741  loc, nproc);
1742  break;
1743  case 3:
1744  if (__itt_metadata_add_ptr) {
1745  // Initialize with master's wait time
1746  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1747  // Set arrive time to zero to be able to check it in
1748  // __kmp_invoke_task(); the same is done inside the loop below
1749  this_thr->th.th_bar_arrive_time = 0;
1750  for (i = 1; i < nproc; ++i) {
1751  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1752  other_threads[i]->th.th_bar_arrive_time = 0;
1753  }
1754  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1755  cur_time, delta, 0);
1756  }
1757  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1758  loc, nproc);
1759  this_thr->th.th_frame_time = cur_time;
1760  break;
1761  }
1762  }
1763 #endif /* USE_ITT_BUILD */
1764  }
1765 #if USE_ITT_BUILD
1766  else {
1767  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1768  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1769  }
1770 #endif /* USE_ITT_BUILD */
1771 
1772 #if KMP_DEBUG
1773  if (KMP_MASTER_TID(tid)) {
1774  KA_TRACE(
1775  15,
1776  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1777  gtid, team_id, tid, nproc));
1778  }
1779 #endif /* KMP_DEBUG */
1780 
1781  // TODO now, mark worker threads as done so they may be disbanded
1782  KMP_MB(); // Flush all pending memory write invalidates.
1783  KA_TRACE(10,
1784  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1785 
1786  ANNOTATE_BARRIER_END(&team->t.t_bar);
1787 }
1788 
1789 // TODO release worker threads' fork barriers as we are ready instead of all at
1790 // once
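// The fork barrier is executed by the master thread to release the workers
// into a new parallel region and by each worker while it waits for work.
// Besides the release itself, it re-syncs the task teams, copies ICVs into
// the workers' implicit tasks (under KMP_BARRIER_ICV_PULL), and applies the
// team's proc_bind/affinity settings.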
1791 void __kmp_fork_barrier(int gtid, int tid) {
1792  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1793  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1794  kmp_info_t *this_thr = __kmp_threads[gtid];
1795  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1796 #if USE_ITT_BUILD
1797  void *itt_sync_obj = NULL;
1798 #endif /* USE_ITT_BUILD */
1799  if (team)
1800  ANNOTATE_BARRIER_END(&team->t.t_bar);
1801 
1802  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1803  (team != NULL) ? team->t.t_id : -1, tid));
1804 
1805  // th_team pointer only valid for master thread here
1806  if (KMP_MASTER_TID(tid)) {
1807 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1808  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1809  // Create itt barrier object
1810  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1811  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1812  }
1813 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1814 
1815 #ifdef KMP_DEBUG
1816  kmp_info_t **other_threads = team->t.t_threads;
1817  int i;
1818 
1819  // Verify state
1820  KMP_MB();
1821 
1822  for (i = 1; i < team->t.t_nproc; ++i) {
1823  KA_TRACE(500,
1824  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1825  "== %u.\n",
1826  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1827  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1828  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1829  KMP_DEBUG_ASSERT(
1830  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1831  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1832  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1833  }
1834 #endif
1835 
1836  if (__kmp_tasking_mode != tskm_immediate_exec) {
1837  // Use 0 to only set up the current team's task team if nthreads > 1
1838  __kmp_task_team_setup(this_thr, team, 0);
1839  }
1840 
1841  /* The master thread may have changed its blocktime between the join barrier
1842  and the fork barrier. Copy the blocktime info to the thread, where
1843  __kmp_wait_template() can access it when the team struct is not
1844  guaranteed to exist. */
1845  // See note about the corresponding code in __kmp_join_barrier() being
1846  // performance-critical
1847  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1848 #if KMP_USE_MONITOR
1849  this_thr->th.th_team_bt_intervals =
1850  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1851  this_thr->th.th_team_bt_set =
1852  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1853 #else
1854  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1855 #endif
1856  }
1857  } // master
1858 
1859  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1860  case bp_hyper_bar: {
1861  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1862  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1863  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1864  break;
1865  }
1866  case bp_hierarchical_bar: {
1867  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1868  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1869  break;
1870  }
1871  case bp_tree_bar: {
1872  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1873  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1874  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1875  break;
1876  }
1877  default: {
1878  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1879  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1880  }
1881  }
1882 
1883 #if OMPT_SUPPORT
1884  if (ompt_enabled.enabled &&
1885  this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
1886  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1887  ompt_data_t *task_data = (team)
1888  ? OMPT_CUR_TASK_DATA(this_thr)
1889  : &(this_thr->th.ompt_thread_info.task_data);
1890  this_thr->th.ompt_thread_info.state = omp_state_overhead;
1891 #if OMPT_OPTIONAL
1892  void *codeptr = NULL;
1893  if (KMP_MASTER_TID(ds_tid) &&
1894  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1895  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1896  codeptr = team->t.ompt_team_info.master_return_address;
1897  if (ompt_enabled.ompt_callback_sync_region_wait) {
1898  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1899  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1900  }
1901  if (ompt_enabled.ompt_callback_sync_region) {
1902  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1903  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1904  }
1905 #endif
1906  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
1907  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1908  ompt_scope_end, NULL, task_data, 0, ds_tid);
1909  }
1910  }
1911 #endif
1912 
1913  // Early exit for reaping threads releasing forkjoin barrier
1914  if (TCR_4(__kmp_global.g.g_done)) {
1915  this_thr->th.th_task_team = NULL;
1916 
1917 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1918  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1919  if (!KMP_MASTER_TID(tid)) {
1920  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1921  if (itt_sync_obj)
1922  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1923  }
1924  }
1925 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1926  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1927  return;
1928  }
1929 
1930  /* We can now assume that a valid team structure has been allocated by the
1931  master and propagated to all worker threads. The current thread, however,
1932  may not be part of the team, so we can't blindly assume that the team
1933  pointer is non-null. */
1934  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1935  KMP_DEBUG_ASSERT(team != NULL);
1936  tid = __kmp_tid_from_gtid(gtid);
1937 
1938 #if KMP_BARRIER_ICV_PULL
1939  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1940  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1941  implicit task has this data before this function is called. We cannot
1942  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
1943  struct, because it is not always the case that the threads arrays have
1944  been allocated when __kmp_fork_call() is executed. */
1945  {
1946  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1947  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1948  // Copy the initial ICVs from the master's thread struct to the implicit
1949  // task for this tid.
1950  KA_TRACE(10,
1951  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1952  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
1953  tid, FALSE);
1954  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1955  &team->t.t_threads[0]
1956  ->th.th_bar[bs_forkjoin_barrier]
1957  .bb.th_fixed_icvs);
1958  }
1959  }
1960 #endif // KMP_BARRIER_ICV_PULL
1961 
1962  if (__kmp_tasking_mode != tskm_immediate_exec) {
1963  __kmp_task_team_sync(this_thr, team);
1964  }
1965 
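  // Apply the team's affinity policy now that the region is starting: under
  // proc_bind_intel, balanced affinity may re-place this thread if the team
  // size changed; any other explicit proc_bind moves the thread to its
  // assigned place unless it is already there.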
1966 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1967  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1968  if (proc_bind == proc_bind_intel) {
1969 #endif
1970 #if KMP_AFFINITY_SUPPORTED
1971  // Call dynamic affinity settings
1972  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1973  __kmp_balanced_affinity(tid, team->t.t_nproc);
1974  }
1975 #endif // KMP_AFFINITY_SUPPORTED
1976 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1977  } else if (proc_bind != proc_bind_false) {
1978  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1979  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1980  __kmp_gtid_from_thread(this_thr),
1981  this_thr->th.th_current_place));
1982  } else {
1983  __kmp_affinity_set_place(gtid);
1984  }
1985  }
1986 #endif
1987 
1988 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1989  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1990  if (!KMP_MASTER_TID(tid)) {
1991  // Get correct barrier object
1992  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1993  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1994  } // (prepare called inside barrier_release)
1995  }
1996 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1997  ANNOTATE_BARRIER_END(&team->t.t_bar);
1998  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
1999  team->t.t_id, tid));
2000 }
2001 
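// Arrange for the new ICVs to reach the team's implicit tasks. Three
// compile-time strategies exist: KMP_BARRIER_ICV_PULL stashes them in the
// master's th_fixed_icvs for the workers to copy after the fork barrier,
// KMP_BARRIER_ICV_PUSH defers propagation to the fork-barrier release, and
// otherwise they are copied directly to every non-master thread right here.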
2002 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2003  kmp_internal_control_t *new_icvs, ident_t *loc) {
2004  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2005 
2006  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2007  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2008 
2009 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2010  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2011  implicit task has this data before this function is called. */
2012 #if KMP_BARRIER_ICV_PULL
2013  /* Copy the ICVs into the master thread's th_fixed_icvs (which remains
2014  untouched), where all of the worker threads can access them and make their
2015  own copies after the barrier. */
2016  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2017  // allocated at this point
2018  copy_icvs(
2019  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2020  new_icvs);
2021  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2022  team->t.t_threads[0], team));
2023 #elif KMP_BARRIER_ICV_PUSH
2024  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2025  // done here.
2026  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2027  team->t.t_threads[0], team));
2028 #else
2029  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2030  // time.
2031  ngo_load(new_icvs);
2032  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2033  // allocated at this point
2034  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2035  // TODO: GEH - pass in better source location info since usually NULL here
2036  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2037  f, team->t.t_threads[f], team));
2038  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2039  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2040  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2041  f, team->t.t_threads[f], team));
2042  }
2043  ngo_sync();
2044 #endif // KMP_BARRIER_ICV_PULL
2045 }