1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_wait_release.h"
16 #include "kmp_itt.h"
17 #include "kmp_os.h"
18 #include "kmp_stats.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
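
// Editor's note: an illustrative sketch (kept out of the build with #if 0) of
// the intended usage pattern for the ngo_* macros above, mirroring how the
// release paths later in this file broadcast ICVs -- load the source cache
// line once, store it to each destination, then fence before signaling.
// The function name and signature are illustrative only.
#if 0
static void ngo_broadcast_sketch(kmp_internal_control_t *src,
                                 kmp_internal_control_t **dests, int n) {
  ngo_load(src); // one load of the 64-byte ICV block (vector register on MIC)
  for (int i = 0; i < n; ++i)
    ngo_store_icvs(dests[i], src); // per-destination non-globally-ordered store
  ngo_sync(); // make the stores visible before any b_go release
}
#endif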
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
46 
47 // Linear Barrier
48 static void __kmp_linear_barrier_gather(
49  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52  kmp_team_t *team = this_thr->th.th_team;
53  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54  kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(
57  20,
58  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66  __itt_get_timestamp();
67  }
68 #endif
69  // We now perform a linear reduction to signal that all of the threads have
70  // arrived.
71  if (!KMP_MASTER_TID(tid)) {
72  KA_TRACE(20,
73  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
74  "arrived(%p): %llu => %llu\n",
75  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78  // Mark arrival to master thread
79  /* After performing this write, a worker thread may not assume that the team
80  is valid any more - it could be deallocated by the master thread at any
81  time. */
82  ANNOTATE_BARRIER_BEGIN(this_thr);
83  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84  flag.release();
85  } else {
86  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87  int nproc = this_thr->th.th_team_nproc;
88  int i;
89  // Don't have to worry about sleep bit here or atomic since team setting
90  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 
92  // Collect all the worker team member threads.
93  for (i = 1; i < nproc; ++i) {
94 #if KMP_CACHE_MANAGE
95  // Prefetch next thread's arrived count
96  if (i + 1 < nproc)
97  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100  "arrived(%p) == %llu\n",
101  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
102  team->t.t_id, i,
103  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
104 
105  // Wait for worker thread to arrive
106  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
107  new_state);
108  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109  ANNOTATE_BARRIER_END(other_threads[i]);
110 #if USE_ITT_BUILD && USE_ITT_NOTIFY
111  // Barrier imbalance - write min of the thread time and the other thread
112  // time to the thread.
113  if (__kmp_forkjoin_frames_mode == 2) {
114  this_thr->th.th_bar_min_time = KMP_MIN(
115  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
116  }
117 #endif
118  if (reduce) {
119  KA_TRACE(100,
120  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
121  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
122  team->t.t_id, i));
123  ANNOTATE_REDUCE_AFTER(reduce);
124  (*reduce)(this_thr->th.th_local.reduce_data,
125  other_threads[i]->th.th_local.reduce_data);
126  ANNOTATE_REDUCE_BEFORE(reduce);
127  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
128  }
129  }
130  // Don't have to worry about sleep bit here or atomic since team setting
131  team_bar->b_arrived = new_state;
132  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
133  "arrived(%p) = %llu\n",
134  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
135  new_state));
136  }
137  KA_TRACE(
138  20,
139  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
140  gtid, team->t.t_id, tid, bt));
141 }
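
// Editor's note: an illustrative sketch (kept out of the build with #if 0) of
// the generation-counter convention the gather above relies on. Each barrier
// episode advances b_arrived by KMP_BARRIER_STATE_BUMP, so the master simply
// waits until every worker's counter reaches the new generation; worker
// counters are never reset between barriers. Names here are illustrative.
#if 0
static void linear_gather_sketch(kmp_info_t **threads, int nproc,
                                 enum barrier_type bt, kmp_uint64 new_state) {
  for (int i = 1; i < nproc; ++i) {
    while (threads[i]->th.th_bar[bt].bb.b_arrived < new_state) {
      // spin; the real gather waits via kmp_flag_64, which adds backoff,
      // yielding and sleeping
    }
  }
}
#endif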
142 
143 static void __kmp_linear_barrier_release(
144  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
145  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
146  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
147  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
148  kmp_team_t *team;
149 
150  if (KMP_MASTER_TID(tid)) {
151  unsigned int i;
152  kmp_uint32 nproc = this_thr->th.th_team_nproc;
153  kmp_info_t **other_threads;
154 
155  team = __kmp_threads[gtid]->th.th_team;
156  KMP_DEBUG_ASSERT(team != NULL);
157  other_threads = team->t.t_threads;
158 
159  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
160  "barrier type %d\n",
161  gtid, team->t.t_id, tid, bt));
162 
163  if (nproc > 1) {
164 #if KMP_BARRIER_ICV_PUSH
165  {
166  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
167  if (propagate_icvs) {
168  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
169  for (i = 1; i < nproc; ++i) {
170  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
171  team, i, FALSE);
172  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
173  &team->t.t_implicit_task_taskdata[0].td_icvs);
174  }
175  ngo_sync();
176  }
177  }
178 #endif // KMP_BARRIER_ICV_PUSH
179 
180  // Now, release all of the worker threads
181  for (i = 1; i < nproc; ++i) {
182 #if KMP_CACHE_MANAGE
183  // Prefetch next thread's go flag
184  if (i + 1 < nproc)
185  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
186 #endif /* KMP_CACHE_MANAGE */
187  KA_TRACE(
188  20,
189  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
190  "go(%p): %u => %u\n",
191  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
192  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
193  other_threads[i]->th.th_bar[bt].bb.b_go,
194  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
195  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
196  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
197  other_threads[i]);
198  flag.release();
199  }
200  }
201  } else { // Wait for the MASTER thread to release us
202  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
203  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
204  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
205  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
206  ANNOTATE_BARRIER_END(this_thr);
207 #if USE_ITT_BUILD && USE_ITT_NOTIFY
208  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
209  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
210  // disabled)
211  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
212  // Cancel wait on previous parallel region...
213  __kmp_itt_task_starting(itt_sync_obj);
214 
215  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
216  return;
217 
218  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
219  if (itt_sync_obj != NULL)
220  // Call prepare as early as possible for "new" barrier
221  __kmp_itt_task_finished(itt_sync_obj);
222  } else
223 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
224  // Early exit for reaping threads releasing forkjoin barrier
225  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
226  return;
227 // The worker thread may now assume that the team is valid.
228 #ifdef KMP_DEBUG
229  tid = __kmp_tid_from_gtid(gtid);
230  team = __kmp_threads[gtid]->th.th_team;
231 #endif
232  KMP_DEBUG_ASSERT(team != NULL);
233  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
234  KA_TRACE(20,
235  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
236  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
237  KMP_MB(); // Flush all pending memory write invalidates.
238  }
239  KA_TRACE(
240  20,
241  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
242  gtid, team->t.t_id, tid, bt));
243 }
244 
245 // Tree barrier
246 static void
247 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
248  int tid, void (*reduce)(void *, void *)
249  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
250  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
251  kmp_team_t *team = this_thr->th.th_team;
252  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
253  kmp_info_t **other_threads = team->t.t_threads;
254  kmp_uint32 nproc = this_thr->th.th_team_nproc;
255  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
256  kmp_uint32 branch_factor = 1 << branch_bits;
257  kmp_uint32 child;
258  kmp_uint32 child_tid;
259  kmp_uint64 new_state;
260 
261  KA_TRACE(
262  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
263  gtid, team->t.t_id, tid, bt));
264  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
265 
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267  // Barrier imbalance - save arrive time to the thread
268  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
269  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
270  __itt_get_timestamp();
271  }
272 #endif
273  // Perform tree gather to wait until all threads have arrived; reduce any
274  // required data as we go
275  child_tid = (tid << branch_bits) + 1;
276  if (child_tid < nproc) {
277  // Parent threads wait for all their children to arrive
278  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
279  child = 1;
280  do {
281  kmp_info_t *child_thr = other_threads[child_tid];
282  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
283 #if KMP_CACHE_MANAGE
284  // Prefetch next thread's arrived count
285  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
286  KMP_CACHE_PREFETCH(
287  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
288 #endif /* KMP_CACHE_MANAGE */
289  KA_TRACE(20,
290  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
291  "arrived(%p) == %llu\n",
292  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
293  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
294  // Wait for child to arrive
295  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
296  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
297  ANNOTATE_BARRIER_END(child_thr);
298 #if USE_ITT_BUILD && USE_ITT_NOTIFY
299  // Barrier imbalance - write min of the thread time and a child time to
300  // the thread.
301  if (__kmp_forkjoin_frames_mode == 2) {
302  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
303  child_thr->th.th_bar_min_time);
304  }
305 #endif
306  if (reduce) {
307  KA_TRACE(100,
308  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
309  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
310  team->t.t_id, child_tid));
311  ANNOTATE_REDUCE_AFTER(reduce);
312  (*reduce)(this_thr->th.th_local.reduce_data,
313  child_thr->th.th_local.reduce_data);
314  ANNOTATE_REDUCE_BEFORE(reduce);
315  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
316  }
317  child++;
318  child_tid++;
319  } while (child <= branch_factor && child_tid < nproc);
320  }
321 
322  if (!KMP_MASTER_TID(tid)) { // Worker threads
323  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
324 
325  KA_TRACE(20,
326  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
327  "arrived(%p): %llu => %llu\n",
328  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
329  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
330  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
331 
332  // Mark arrival to parent thread
333  /* After performing this write, a worker thread may not assume that the team
334  is valid any more - it could be deallocated by the master thread at any
335  time. */
336  ANNOTATE_BARRIER_BEGIN(this_thr);
337  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
338  flag.release();
339  } else {
340  // Need to update the team arrived pointer if we are the master thread
341  if (nproc > 1) // New value was already computed above
342  team->t.t_bar[bt].b_arrived = new_state;
343  else
344  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
345  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
346  "arrived(%p) = %llu\n",
347  gtid, team->t.t_id, tid, team->t.t_id,
348  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
349  }
350  KA_TRACE(20,
351  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
352  gtid, team->t.t_id, tid, bt));
353 }
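
// Editor's note: an illustrative sketch (kept out of the build with #if 0) of
// the parent/child tid arithmetic used by the tree gather above (and by the
// matching release below). Example: with branch_bits = 2 (branch_factor = 4)
// and nproc = 10, tid 0 owns children 1..4, tid 1 owns 5..8, tid 2 owns 9,
// and tid 5's parent is (5 - 1) >> 2 = 1.
#if 0
static void tree_topology_sketch(int tid, int nproc, int branch_bits) {
  int branch_factor = 1 << branch_bits;
  int parent_tid = (tid == 0) ? -1 : (tid - 1) >> branch_bits;
  for (int child = 1, child_tid = (tid << branch_bits) + 1;
       child <= branch_factor && child_tid < nproc; ++child, ++child_tid) {
    // gather: wait on child_tid's b_arrived; release: bump child_tid's b_go
  }
  (void)parent_tid;
}
#endif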
354 
355 static void __kmp_tree_barrier_release(
356  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
357  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
358  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
359  kmp_team_t *team;
360  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
361  kmp_uint32 nproc;
362  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
363  kmp_uint32 branch_factor = 1 << branch_bits;
364  kmp_uint32 child;
365  kmp_uint32 child_tid;
366 
367  // Perform a tree release for all of the threads that have been gathered
368  if (!KMP_MASTER_TID(
369  tid)) { // Handle fork barrier workers who aren't part of a team yet
370  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
371  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
372  // Wait for parent thread to release us
373  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
374  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
375  ANNOTATE_BARRIER_END(this_thr);
376 #if USE_ITT_BUILD && USE_ITT_NOTIFY
377  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
378  // In fork barrier where we could not get the object reliably (or
379  // ITTNOTIFY is disabled)
380  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
381  // Cancel wait on previous parallel region...
382  __kmp_itt_task_starting(itt_sync_obj);
383 
384  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
385  return;
386 
387  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
388  if (itt_sync_obj != NULL)
389  // Call prepare as early as possible for "new" barrier
390  __kmp_itt_task_finished(itt_sync_obj);
391  } else
392 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
393  // Early exit for reaping threads releasing forkjoin barrier
394  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
395  return;
396 
397  // The worker thread may now assume that the team is valid.
398  team = __kmp_threads[gtid]->th.th_team;
399  KMP_DEBUG_ASSERT(team != NULL);
400  tid = __kmp_tid_from_gtid(gtid);
401 
402  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
403  KA_TRACE(20,
404  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
405  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
406  KMP_MB(); // Flush all pending memory write invalidates.
407  } else {
408  team = __kmp_threads[gtid]->th.th_team;
409  KMP_DEBUG_ASSERT(team != NULL);
410  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
411  "barrier type %d\n",
412  gtid, team->t.t_id, tid, bt));
413  }
414  nproc = this_thr->th.th_team_nproc;
415  child_tid = (tid << branch_bits) + 1;
416 
417  if (child_tid < nproc) {
418  kmp_info_t **other_threads = team->t.t_threads;
419  child = 1;
420  // Parent threads release all their children
421  do {
422  kmp_info_t *child_thr = other_threads[child_tid];
423  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
424 #if KMP_CACHE_MANAGE
425  // Prefetch next thread's go count
426  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
427  KMP_CACHE_PREFETCH(
428  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
429 #endif /* KMP_CACHE_MANAGE */
430 
431 #if KMP_BARRIER_ICV_PUSH
432  {
433  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
434  if (propagate_icvs) {
435  __kmp_init_implicit_task(team->t.t_ident,
436  team->t.t_threads[child_tid], team,
437  child_tid, FALSE);
438  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
439  &team->t.t_implicit_task_taskdata[0].td_icvs);
440  }
441  }
442 #endif // KMP_BARRIER_ICV_PUSH
443  KA_TRACE(20,
444  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
445  "go(%p): %u => %u\n",
446  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
447  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
448  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
449  // Release child from barrier
450  ANNOTATE_BARRIER_BEGIN(child_thr);
451  kmp_flag_64 flag(&child_bar->b_go, child_thr);
452  flag.release();
453  child++;
454  child_tid++;
455  } while (child <= branch_factor && child_tid < nproc);
456  }
457  KA_TRACE(
458  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
459  gtid, team->t.t_id, tid, bt));
460 }
461 
462 // Hyper Barrier
463 static void
464 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
465  int tid, void (*reduce)(void *, void *)
466  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
467  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
468  kmp_team_t *team = this_thr->th.th_team;
469  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
470  kmp_info_t **other_threads = team->t.t_threads;
471  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
472  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
473  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
474  kmp_uint32 branch_factor = 1 << branch_bits;
475  kmp_uint32 offset;
476  kmp_uint32 level;
477 
478  KA_TRACE(
479  20,
480  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
481  gtid, team->t.t_id, tid, bt));
482  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
483 
484 #if USE_ITT_BUILD && USE_ITT_NOTIFY
485  // Barrier imbalance - save arrive time to the thread
486  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
487  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
488  __itt_get_timestamp();
489  }
490 #endif
491  /* Perform a hypercube-embedded tree gather to wait until all of the threads
492  have arrived, and reduce any required data as we go. */
493  kmp_flag_64 p_flag(&thr_bar->b_arrived);
494  for (level = 0, offset = 1; offset < num_threads;
495  level += branch_bits, offset <<= branch_bits) {
496  kmp_uint32 child;
497  kmp_uint32 child_tid;
498 
499  if (((tid >> level) & (branch_factor - 1)) != 0) {
500  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
501 
502  KA_TRACE(20,
503  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
504  "arrived(%p): %llu => %llu\n",
505  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
506  team->t.t_id, parent_tid, &thr_bar->b_arrived,
507  thr_bar->b_arrived,
508  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
509  // Mark arrival to parent thread
510  /* After performing this write (in the last iteration of the enclosing for
511  loop), a worker thread may not assume that the team is valid any more
512  - it could be deallocated by the master thread at any time. */
513  ANNOTATE_BARRIER_BEGIN(this_thr);
514  p_flag.set_waiter(other_threads[parent_tid]);
515  p_flag.release();
516  break;
517  }
518 
519  // Parent threads wait for children to arrive
520  if (new_state == KMP_BARRIER_UNUSED_STATE)
521  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
522  for (child = 1, child_tid = tid + (1 << level);
523  child < branch_factor && child_tid < num_threads;
524  child++, child_tid += (1 << level)) {
525  kmp_info_t *child_thr = other_threads[child_tid];
526  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
527 #if KMP_CACHE_MANAGE
528  kmp_uint32 next_child_tid = child_tid + (1 << level);
529  // Prefetch next thread's arrived count
530  if (child + 1 < branch_factor && next_child_tid < num_threads)
531  KMP_CACHE_PREFETCH(
532  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
533 #endif /* KMP_CACHE_MANAGE */
534  KA_TRACE(20,
535  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
536  "arrived(%p) == %llu\n",
537  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
538  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
539  // Wait for child to arrive
540  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
541  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
542  ANNOTATE_BARRIER_END(child_thr);
543 #if USE_ITT_BUILD && USE_ITT_NOTIFY
544  // Barrier imbalance - write min of the thread time and a child time to
545  // the thread.
546  if (__kmp_forkjoin_frames_mode == 2) {
547  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
548  child_thr->th.th_bar_min_time);
549  }
550 #endif
551  if (reduce) {
552  KA_TRACE(100,
553  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
554  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
555  team->t.t_id, child_tid));
556  ANNOTATE_REDUCE_AFTER(reduce);
557  (*reduce)(this_thr->th.th_local.reduce_data,
558  child_thr->th.th_local.reduce_data);
559  ANNOTATE_REDUCE_BEFORE(reduce);
560  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
561  }
562  }
563  }
564 
565  if (KMP_MASTER_TID(tid)) {
566  // Need to update the team arrived pointer if we are the master thread
567  if (new_state == KMP_BARRIER_UNUSED_STATE)
568  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
569  else
570  team->t.t_bar[bt].b_arrived = new_state;
571  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
572  "arrived(%p) = %llu\n",
573  gtid, team->t.t_id, tid, team->t.t_id,
574  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
575  }
576  KA_TRACE(
577  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
578  gtid, team->t.t_id, tid, bt));
579 }
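
// Editor's note: an illustrative sketch (kept out of the build with #if 0) of
// the hypercube pairing used by the gather above. At each level a thread
// either reports to its parent (if its digit at that level is nonzero) or
// collects children spaced 1 << level apart. With branch_bits = 1 and 8
// threads: level 0 pairs (0,1)(2,3)(4,5)(6,7), level 1 pairs (0,2)(4,6),
// level 2 pairs (0,4).
#if 0
static void hyper_topology_sketch(int tid, int num_threads, int branch_bits) {
  int branch_factor = 1 << branch_bits;
  for (int level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    if (((tid >> level) & (branch_factor - 1)) != 0) {
      int parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
      (void)parent_tid; // report arrival to parent_tid and stop climbing
      break;
    }
    for (int child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         ++child, child_tid += (1 << level)) {
      // wait on child_tid's b_arrived at this level
    }
  }
}
#endif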
580 
581 // The reverse versions seem to beat the forward versions overall
582 #define KMP_REVERSE_HYPER_BAR
583 static void __kmp_hyper_barrier_release(
584  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
585  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
586  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
587  kmp_team_t *team;
588  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
589  kmp_info_t **other_threads;
590  kmp_uint32 num_threads;
591  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
592  kmp_uint32 branch_factor = 1 << branch_bits;
593  kmp_uint32 child;
594  kmp_uint32 child_tid;
595  kmp_uint32 offset;
596  kmp_uint32 level;
597 
598  /* Perform a hypercube-embedded tree release for all of the threads that have
599  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
600  are released in the reverse order of the corresponding gather, otherwise
601  threads are released in the same order. */
602  if (KMP_MASTER_TID(tid)) { // master
603  team = __kmp_threads[gtid]->th.th_team;
604  KMP_DEBUG_ASSERT(team != NULL);
605  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
606  "barrier type %d\n",
607  gtid, team->t.t_id, tid, bt));
608 #if KMP_BARRIER_ICV_PUSH
609  if (propagate_icvs) { // master already has ICVs in final destination; copy
610  copy_icvs(&thr_bar->th_fixed_icvs,
611  &team->t.t_implicit_task_taskdata[tid].td_icvs);
612  }
613 #endif
614  } else { // Handle fork barrier workers who aren't part of a team yet
615  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
616  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
617  // Wait for parent thread to release us
618  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
619  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
620  ANNOTATE_BARRIER_END(this_thr);
621 #if USE_ITT_BUILD && USE_ITT_NOTIFY
622  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
623  // In fork barrier where we could not get the object reliably
624  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
625  // Cancel wait on previous parallel region...
626  __kmp_itt_task_starting(itt_sync_obj);
627 
628  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
629  return;
630 
631  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
632  if (itt_sync_obj != NULL)
633  // Call prepare as early as possible for "new" barrier
634  __kmp_itt_task_finished(itt_sync_obj);
635  } else
636 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
637  // Early exit for reaping threads releasing forkjoin barrier
638  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
639  return;
640 
641  // The worker thread may now assume that the team is valid.
642  team = __kmp_threads[gtid]->th.th_team;
643  KMP_DEBUG_ASSERT(team != NULL);
644  tid = __kmp_tid_from_gtid(gtid);
645 
646  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
647  KA_TRACE(20,
648  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
649  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
650  KMP_MB(); // Flush all pending memory write invalidates.
651  }
652  num_threads = this_thr->th.th_team_nproc;
653  other_threads = team->t.t_threads;
654 
655 #ifdef KMP_REVERSE_HYPER_BAR
656  // Count up to correct level for parent
657  for (level = 0, offset = 1;
658  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
659  level += branch_bits, offset <<= branch_bits)
660  ;
661 
662  // Now go down from there
663  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
664  level -= branch_bits, offset >>= branch_bits)
665 #else
666  // Go down the tree, level by level
667  for (level = 0, offset = 1; offset < num_threads;
668  level += branch_bits, offset <<= branch_bits)
669 #endif // KMP_REVERSE_HYPER_BAR
670  {
671 #ifdef KMP_REVERSE_HYPER_BAR
672  /* Now go in reverse order through the children, highest to lowest.
673  Initial setting of child is conservative here. */
674  child = num_threads >> ((level == 0) ? level : level - 1);
675  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
676  child_tid = tid + (child << level);
677  child >= 1; child--, child_tid -= (1 << level))
678 #else
679  if (((tid >> level) & (branch_factor - 1)) != 0)
680  // No need to go lower than this, since this is the level parent would be
681  // notified
682  break;
683  // Iterate through children on this level of the tree
684  for (child = 1, child_tid = tid + (1 << level);
685  child < branch_factor && child_tid < num_threads;
686  child++, child_tid += (1 << level))
687 #endif // KMP_REVERSE_HYPER_BAR
688  {
689  if (child_tid >= num_threads)
690  continue; // Child doesn't exist so keep going
691  else {
692  kmp_info_t *child_thr = other_threads[child_tid];
693  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
694 #if KMP_CACHE_MANAGE
695  kmp_uint32 next_child_tid = child_tid - (1 << level);
696 // Prefetch next thread's go count
697 #ifdef KMP_REVERSE_HYPER_BAR
698  if (child - 1 >= 1 && next_child_tid < num_threads)
699 #else
700  if (child + 1 < branch_factor && next_child_tid < num_threads)
701 #endif // KMP_REVERSE_HYPER_BAR
702  KMP_CACHE_PREFETCH(
703  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
704 #endif /* KMP_CACHE_MANAGE */
705 
706 #if KMP_BARRIER_ICV_PUSH
707  if (propagate_icvs) // push my fixed ICVs to my child
708  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
709 #endif // KMP_BARRIER_ICV_PUSH
710 
711  KA_TRACE(
712  20,
713  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
714  "go(%p): %u => %u\n",
715  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
716  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
717  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
718  // Release child from barrier
719  ANNOTATE_BARRIER_BEGIN(child_thr);
720  kmp_flag_64 flag(&child_bar->b_go, child_thr);
721  flag.release();
722  }
723  }
724  }
725 #if KMP_BARRIER_ICV_PUSH
726  if (propagate_icvs &&
727  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
728  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
729  FALSE);
730  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
731  &thr_bar->th_fixed_icvs);
732  }
733 #endif
734  KA_TRACE(
735  20,
736  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
737  gtid, team->t.t_id, tid, bt));
738 }
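
// Editor's note: worked example of the order above, assuming branch_bits == 1
// and 8 threads. During the gather, tid 0 collects tid 1 at level 0, tid 2 at
// level 1, and tid 4 at level 2. With KMP_REVERSE_HYPER_BAR defined, the
// release walks the levels back down, so tid 0 releases tid 4 first, then
// tid 2, then tid 1 -- the reverse of the order in which they checked in.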
739 
740 // Hierarchical Barrier
741 
742 // Initialize thread barrier data
743 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
744  Performs the minimum amount of initialization required based on how the team
745  has changed. Returns true if leaf children will require both on-core and
746  traditional wake-up mechanisms. For example, if the team size increases,
747  threads already in the team will respond to on-core wakeup on their parent
748  thread, but threads newly added to the team will only be listening on
749  their local b_go. */
750 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
751  kmp_bstate_t *thr_bar,
752  kmp_uint32 nproc, int gtid,
753  int tid, kmp_team_t *team) {
754  // Checks to determine if (re-)initialization is needed
755  bool uninitialized = thr_bar->team == NULL;
756  bool team_changed = team != thr_bar->team;
757  bool team_sz_changed = nproc != thr_bar->nproc;
758  bool tid_changed = tid != thr_bar->old_tid;
759  bool retval = false;
760 
761  if (uninitialized || team_sz_changed) {
762  __kmp_get_hierarchy(nproc, thr_bar);
763  }
764 
765  if (uninitialized || team_sz_changed || tid_changed) {
766  thr_bar->my_level = thr_bar->depth - 1; // default for master
767  thr_bar->parent_tid = -1; // default for master
768  if (!KMP_MASTER_TID(
769  tid)) { // if not master, find parent thread in hierarchy
770  kmp_uint32 d = 0;
771  while (d < thr_bar->depth) { // find parent based on level of thread in
772  // hierarchy, and note level
773  kmp_uint32 rem;
774  if (d == thr_bar->depth - 2) { // reached level right below the master
775  thr_bar->parent_tid = 0;
776  thr_bar->my_level = d;
777  break;
778  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
779  0) { // TODO: can we make this op faster?
780  // thread is not a subtree root at the next level, so d is its level
781  thr_bar->parent_tid = tid - rem;
782  thr_bar->my_level = d;
783  break;
784  }
785  ++d;
786  }
787  }
788  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
789  thr_bar->old_tid = tid;
790  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
791  thr_bar->team = team;
792  thr_bar->parent_bar =
793  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
794  }
795  if (uninitialized || team_changed || tid_changed) {
796  thr_bar->team = team;
797  thr_bar->parent_bar =
798  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
799  retval = true;
800  }
801  if (uninitialized || team_sz_changed || tid_changed) {
802  thr_bar->nproc = nproc;
803  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
804  if (thr_bar->my_level == 0)
805  thr_bar->leaf_kids = 0;
806  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
807  thr_bar->leaf_kids = nproc - tid - 1;
808  thr_bar->leaf_state = 0;
809  for (int i = 0; i < thr_bar->leaf_kids; ++i)
810  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
811  }
812  return retval;
813 }
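
// Editor's note: an illustrative sketch (kept out of the build with #if 0) of
// how the hierarchical barrier packs leaf check-ins into individual bytes of
// a parent's 64-bit flag word. A leaf with tid == parent_tid + k + 1 gets
// offset 7 - k (see the offset computation above), so the first leaf child
// signals byte 7, the second byte 6, and so on; leaf_state is the OR of those
// per-byte masks, which is what the parent's gather waits for.
#if 0
static kmp_uint64 leaf_state_sketch(int leaf_kids) {
  kmp_uint64 state = 0;
  for (int i = 0; i < leaf_kids; ++i)
    ((char *)&state)[7 - i] = 1; // same per-byte flags the init routine sets
  return state;
}
#endif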
814 
815 static void __kmp_hierarchical_barrier_gather(
816  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
817  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
818  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
819  kmp_team_t *team = this_thr->th.th_team;
820  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
821  kmp_uint32 nproc = this_thr->th.th_team_nproc;
822  kmp_info_t **other_threads = team->t.t_threads;
823  kmp_uint64 new_state;
824 
825  int level = team->t.t_level;
826 #if OMP_40_ENABLED
827  if (other_threads[0]
828  ->th.th_teams_microtask) // are we inside the teams construct?
829  if (this_thr->th.th_teams_size.nteams > 1)
830  ++level; // level was not increased in teams construct for team_of_masters
831 #endif
832  if (level == 1)
833  thr_bar->use_oncore_barrier = 1;
834  else
835  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
836 
837  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
838  "barrier type %d\n",
839  gtid, team->t.t_id, tid, bt));
840  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
841 
842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
843  // Barrier imbalance - save arrive time to the thread
844  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
845  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
846  }
847 #endif
848 
849  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
850  team);
851 
852  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
853  kmp_int32 child_tid;
854  new_state =
855  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
856  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
857  thr_bar->use_oncore_barrier) {
858  if (thr_bar->leaf_kids) {
859  // First, wait for leaf children to check-in on my b_arrived flag
860  kmp_uint64 leaf_state =
861  KMP_MASTER_TID(tid)
862  ? thr_bar->b_arrived | thr_bar->leaf_state
863  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
864  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
865  "for leaf kids\n",
866  gtid, team->t.t_id, tid));
867  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
868  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
869  if (reduce) {
870  ANNOTATE_REDUCE_AFTER(reduce);
871  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
872  ++child_tid) {
873  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
874  "T#%d(%d:%d)\n",
875  gtid, team->t.t_id, tid,
876  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
877  child_tid));
878  ANNOTATE_BARRIER_END(other_threads[child_tid]);
879  (*reduce)(this_thr->th.th_local.reduce_data,
880  other_threads[child_tid]->th.th_local.reduce_data);
881  }
882  ANNOTATE_REDUCE_BEFORE(reduce);
883  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
884  }
885  // clear leaf_state bits
886  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
887  }
888  // Next, wait for higher level children on each child's b_arrived flag
889  for (kmp_uint32 d = 1; d < thr_bar->my_level;
890  ++d) { // gather lowest level threads first, but skip 0
891  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
892  skip = thr_bar->skip_per_level[d];
893  if (last > nproc)
894  last = nproc;
895  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
896  kmp_info_t *child_thr = other_threads[child_tid];
897  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
898  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
899  "T#%d(%d:%d) "
900  "arrived(%p) == %llu\n",
901  gtid, team->t.t_id, tid,
902  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
903  child_tid, &child_bar->b_arrived, new_state));
904  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
905  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
906  ANNOTATE_BARRIER_END(child_thr);
907  if (reduce) {
908  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
909  "T#%d(%d:%d)\n",
910  gtid, team->t.t_id, tid,
911  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
912  child_tid));
913  ANNOTATE_REDUCE_AFTER(reduce);
914  (*reduce)(this_thr->th.th_local.reduce_data,
915  child_thr->th.th_local.reduce_data);
916  ANNOTATE_REDUCE_BEFORE(reduce);
917  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
918  }
919  }
920  }
921  } else { // Blocktime is not infinite
922  for (kmp_uint32 d = 0; d < thr_bar->my_level;
923  ++d) { // Gather lowest level threads first
924  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
925  skip = thr_bar->skip_per_level[d];
926  if (last > nproc)
927  last = nproc;
928  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
929  kmp_info_t *child_thr = other_threads[child_tid];
930  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
931  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
932  "T#%d(%d:%d) "
933  "arrived(%p) == %llu\n",
934  gtid, team->t.t_id, tid,
935  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
936  child_tid, &child_bar->b_arrived, new_state));
937  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
938  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
939  ANNOTATE_BARRIER_END(child_thr);
940  if (reduce) {
941  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
942  "T#%d(%d:%d)\n",
943  gtid, team->t.t_id, tid,
944  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
945  child_tid));
946  ANNOTATE_REDUCE_AFTER(reduce);
947  (*reduce)(this_thr->th.th_local.reduce_data,
948  child_thr->th.th_local.reduce_data);
949  ANNOTATE_REDUCE_BEFORE(reduce);
950  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
951  }
952  }
953  }
954  }
955  }
956  // All subordinates are gathered; now release parent if not master thread
957 
958  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
959  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
960  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
961  gtid, team->t.t_id, tid,
962  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
963  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
964  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
965  /* Mark arrival to parent: After performing this write, a worker thread may
966  not assume that the team is valid any more - it could be deallocated by
967  the master thread at any time. */
968  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
969  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
970  // flag; release it
971  ANNOTATE_BARRIER_BEGIN(this_thr);
972  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
973  flag.release();
974  } else {
975  // Leaf does special release on "offset" bits of parent's b_arrived flag
976  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
977  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
978  flag.set_waiter(other_threads[thr_bar->parent_tid]);
979  flag.release();
980  }
981  } else { // Master thread needs to update the team's b_arrived value
982  team->t.t_bar[bt].b_arrived = new_state;
983  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
984  "arrived(%p) = %llu\n",
985  gtid, team->t.t_id, tid, team->t.t_id,
986  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
987  }
988  // Is the team access below unsafe or just technically invalid?
989  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
990  "barrier type %d\n",
991  gtid, team->t.t_id, tid, bt));
992 }
993 
994 static void __kmp_hierarchical_barrier_release(
995  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
996  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
997  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
998  kmp_team_t *team;
999  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1000  kmp_uint32 nproc;
1001  bool team_change = false; // indicates on-core barrier shouldn't be used
1002 
1003  if (KMP_MASTER_TID(tid)) {
1004  team = __kmp_threads[gtid]->th.th_team;
1005  KMP_DEBUG_ASSERT(team != NULL);
1006  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1007  "entered barrier type %d\n",
1008  gtid, team->t.t_id, tid, bt));
1009  } else { // Worker threads
1010  // Wait for parent thread to release me
1011  if (!thr_bar->use_oncore_barrier ||
1012  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1013  thr_bar->team == NULL) {
1014  // Use traditional method of waiting on my own b_go flag
1015  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1016  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1017  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1018  ANNOTATE_BARRIER_END(this_thr);
1019  TCW_8(thr_bar->b_go,
1020  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1021  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1022  // infinite, not nested
1023  // Wait on my "offset" bits on parent's b_go flag
1024  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1025  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1026  thr_bar->offset, bt,
1027  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1028  flag.wait(this_thr, TRUE);
1029  if (thr_bar->wait_flag ==
1030  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1031  TCW_8(thr_bar->b_go,
1032  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1033  } else { // Reset my bits on parent's b_go flag
1034  (RCAST(volatile char *,
1035  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1036  }
1037  }
1038  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1039  // Early exit for reaping threads releasing forkjoin barrier
1040  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1041  return;
1042  // The worker thread may now assume that the team is valid.
1043  team = __kmp_threads[gtid]->th.th_team;
1044  KMP_DEBUG_ASSERT(team != NULL);
1045  tid = __kmp_tid_from_gtid(gtid);
1046 
1047  KA_TRACE(
1048  20,
1049  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1050  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1051  KMP_MB(); // Flush all pending memory write invalidates.
1052  }
1053 
1054  nproc = this_thr->th.th_team_nproc;
1055  int level = team->t.t_level;
1056 #if OMP_40_ENABLED
1057  if (team->t.t_threads[0]
1058  ->th.th_teams_microtask) { // are we inside the teams construct?
1059  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1060  this_thr->th.th_teams_level == level)
1061  ++level; // level was not increased in teams construct for team_of_workers
1062  if (this_thr->th.th_teams_size.nteams > 1)
1063  ++level; // level was not increased in teams construct for team_of_masters
1064  }
1065 #endif
1066  if (level == 1)
1067  thr_bar->use_oncore_barrier = 1;
1068  else
1069  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1070 
1071  // If the team size has increased, we still communicate with old leaves via
1072  // oncore barrier.
1073  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1074  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1075  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1076  tid, team);
1077  // But if the entire team changes, we won't use oncore barrier at all
1078  if (team_change)
1079  old_leaf_kids = 0;
1080 
1081 #if KMP_BARRIER_ICV_PUSH
1082  if (propagate_icvs) {
1083  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1084  FALSE);
1085  if (KMP_MASTER_TID(
1086  tid)) { // master already has copy in final destination; copy
1087  copy_icvs(&thr_bar->th_fixed_icvs,
1088  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1089  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1090  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1091  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1092  // leaves (on-core children) pull parent's fixed ICVs directly to local
1093  // ICV store
1094  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1095  &thr_bar->parent_bar->th_fixed_icvs);
1096  // non-leaves will get ICVs piggybacked with b_go via NGO store
1097  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1098  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1099  // access
1100  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1101  else // leaves copy parent's fixed ICVs directly to local ICV store
1102  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1103  &thr_bar->parent_bar->th_fixed_icvs);
1104  }
1105  }
1106 #endif // KMP_BARRIER_ICV_PUSH
1107 
1108  // Now, release my children
1109  if (thr_bar->my_level) { // not a leaf
1110  kmp_int32 child_tid;
1111  kmp_uint32 last;
1112  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1113  thr_bar->use_oncore_barrier) {
1114  if (KMP_MASTER_TID(tid)) { // do a flat release
1115  // Set local b_go to bump children via NGO store of the cache line
1116  // containing ICVs and b_go.
1117  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1118  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1119  // the cache line
1120  ngo_load(&thr_bar->th_fixed_icvs);
1121  // This loops over all the threads skipping only the leaf nodes in the
1122  // hierarchy
1123  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1124  child_tid += thr_bar->skip_per_level[1]) {
1125  kmp_bstate_t *child_bar =
1126  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1127  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1128  "releasing T#%d(%d:%d)"
1129  " go(%p): %u => %u\n",
1130  gtid, team->t.t_id, tid,
1131  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1132  child_tid, &child_bar->b_go, child_bar->b_go,
1133  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1134  // Use ngo store (if available) to both store ICVs and release child
1135  // via child's b_go
1136  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1137  }
1138  ngo_sync();
1139  }
1140  TCW_8(thr_bar->b_go,
1141  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1142  // Now, release leaf children
1143  if (thr_bar->leaf_kids) { // if there are any
1144  // We test team_change on the off-chance that the level 1 team changed.
1145  if (team_change ||
1146  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1147  if (old_leaf_kids) { // release old leaf kids
1148  thr_bar->b_go |= old_leaf_state;
1149  }
1150  // Release new leaf kids
1151  last = tid + thr_bar->skip_per_level[1];
1152  if (last > nproc)
1153  last = nproc;
1154  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1155  ++child_tid) { // skip_per_level[0]=1
1156  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1157  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1158  KA_TRACE(
1159  20,
1160  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1161  " T#%d(%d:%d) go(%p): %u => %u\n",
1162  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1163  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1164  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1165  // Release child using child's b_go flag
1166  ANNOTATE_BARRIER_BEGIN(child_thr);
1167  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1168  flag.release();
1169  }
1170  } else { // Release all children at once with leaf_state bits on my own
1171  // b_go flag
1172  thr_bar->b_go |= thr_bar->leaf_state;
1173  }
1174  }
1175  } else { // Blocktime is not infinite; do a simple hierarchical release
1176  for (int d = thr_bar->my_level - 1; d >= 0;
1177  --d) { // Release highest level threads first
1178  last = tid + thr_bar->skip_per_level[d + 1];
1179  kmp_uint32 skip = thr_bar->skip_per_level[d];
1180  if (last > nproc)
1181  last = nproc;
1182  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1183  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1184  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1185  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1186  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1187  gtid, team->t.t_id, tid,
1188  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1189  child_tid, &child_bar->b_go, child_bar->b_go,
1190  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1191  // Release child using child's b_go flag
1192  ANNOTATE_BARRIER_BEGIN(child_thr);
1193  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1194  flag.release();
1195  }
1196  }
1197  }
1198 #if KMP_BARRIER_ICV_PUSH
1199  if (propagate_icvs && !KMP_MASTER_TID(tid))
1200  // non-leaves copy ICVs from fixed ICVs to local dest
1201  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1202  &thr_bar->th_fixed_icvs);
1203 #endif // KMP_BARRIER_ICV_PUSH
1204  }
1205  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1206  "barrier type %d\n",
1207  gtid, team->t.t_id, tid, bt));
1208 }
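
// Editor's note on the release paths above: with infinite blocktime and the
// on-core barrier, the master NGO-stores its th_fixed_icvs cache line into
// every non-leaf thread's th_fixed_icvs (a flat release); b_go lives in the
// last 8 bytes of that line, so one store both delivers ICVs and wakes the
// child. Leaves instead watch their "offset" byte of the parent's b_go via
// kmp_flag_oncore, unless they were switched to their own b_go
// (KMP_BARRIER_SWITCHING). Otherwise a plain hierarchical release on each
// child's own b_go flag is used.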
1209 
1210 // End of Barrier Algorithms
1211 
1212 // Internal function to do a barrier.
1213 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1214  If reduce is non-NULL, do a split reduction barrier; otherwise, do a
1215  split barrier without a reduction.
1216  Returns 0 if master thread, 1 if worker thread. */
1217 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1218  size_t reduce_size, void *reduce_data,
1219  void (*reduce)(void *, void *)) {
1220  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1221  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1222  int tid = __kmp_tid_from_gtid(gtid);
1223  kmp_info_t *this_thr = __kmp_threads[gtid];
1224  kmp_team_t *team = this_thr->th.th_team;
1225  int status = 0;
1226 #if OMPT_SUPPORT && OMPT_OPTIONAL
1227  ompt_data_t *my_task_data;
1228  ompt_data_t *my_parallel_data;
1229  void *return_address;
1230 #endif
1231 
1232  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1233  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1234 
1235  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1236 #if OMPT_SUPPORT
1237  if (ompt_enabled.enabled) {
1238 #if OMPT_OPTIONAL
1239  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1240  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1241  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1242  if (ompt_enabled.ompt_callback_sync_region) {
1243  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1244  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1245  my_task_data, return_address);
1246  }
1247  if (ompt_enabled.ompt_callback_sync_region_wait) {
1248  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1249  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1250  my_task_data, return_address);
1251  }
1252 #endif
1253  // It is OK to report the barrier state after the barrier begin callback.
1254  // According to the OMPT specification, a compliant implementation may
1255  // even delay reporting this state until the barrier begins to wait.
1256  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1257  }
1258 #endif
1259 
1260  if (!team->t.t_serialized) {
1261 #if USE_ITT_BUILD
1262  // This value will be used in itt notify events below.
1263  void *itt_sync_obj = NULL;
1264 #if USE_ITT_NOTIFY
1265  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1266  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1267 #endif
1268 #endif /* USE_ITT_BUILD */
1269  if (__kmp_tasking_mode == tskm_extra_barrier) {
1270  __kmp_tasking_barrier(team, this_thr, gtid);
1271  KA_TRACE(15,
1272  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1273  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1274  }
1275 
1276  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1277  access it when the team struct is not guaranteed to exist. */
1278  // See note about the corresponding code in __kmp_join_barrier() being
1279  // performance-critical.
1280  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1281 #if KMP_USE_MONITOR
1282  this_thr->th.th_team_bt_intervals =
1283  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1284  this_thr->th.th_team_bt_set =
1285  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1286 #else
1287  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1288 #endif
1289  }
1290 
1291 #if USE_ITT_BUILD
1292  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1293  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1294 #endif /* USE_ITT_BUILD */
1295 #if USE_DEBUGGER
1296  // Let the debugger know: the thread arrived at the barrier and is waiting.
1297  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1298  team->t.t_bar[bt].b_master_arrived += 1;
1299  } else {
1300  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1301  } // if
1302 #endif /* USE_DEBUGGER */
1303  if (reduce != NULL) {
1304  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1305  this_thr->th.th_local.reduce_data = reduce_data;
1306  }
1307 
1308  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1309  __kmp_task_team_setup(
1310  this_thr, team,
1311  0); // use 0 to only setup the current team if nthreads > 1
1312 
1313  switch (__kmp_barrier_gather_pattern[bt]) {
1314  case bp_hyper_bar: {
1315  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1316  // to 0; use linear
1317  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1318  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1319  break;
1320  }
1321  case bp_hierarchical_bar: {
1322  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
1323  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1324  break;
1325  }
1326  case bp_tree_bar: {
1327  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1328  // to 0; use linear
1329  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1330  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1331  break;
1332  }
1333  default: {
1334  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1335  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1336  }
1337  }
1338 
1339  KMP_MB();
1340 
1341  if (KMP_MASTER_TID(tid)) {
1342  status = 0;
1343  if (__kmp_tasking_mode != tskm_immediate_exec) {
1344  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1345  }
1346 #if USE_DEBUGGER
1347  // Let the debugger know: all threads have arrived and are starting to
1348  // leave the barrier.
1349  team->t.t_bar[bt].b_team_arrived += 1;
1350 #endif
1351 
1352 #if OMP_40_ENABLED
1353  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1354  // Reset cancellation flag for worksharing constructs
1355  if (cancel_request == cancel_loop || cancel_request == cancel_sections) {
1356  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1357  }
1358 #endif
1359 #if USE_ITT_BUILD
1360  /* TODO: In case of split reduction barrier, master thread may send
1361  acquired event early, before the final summation into the shared
1362  variable is done (final summation can be a long operation for array
1363  reductions). */
1364  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1365  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1366 #endif /* USE_ITT_BUILD */
1367 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1368  // Barrier - report frame end (only if active_level == 1)
1369  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1370  __kmp_forkjoin_frames_mode &&
1371 #if OMP_40_ENABLED
1372  this_thr->th.th_teams_microtask == NULL &&
1373 #endif
1374  team->t.t_active_level == 1) {
1375  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1376  kmp_uint64 cur_time = __itt_get_timestamp();
1377  kmp_info_t **other_threads = team->t.t_threads;
1378  int nproc = this_thr->th.th_team_nproc;
1379  int i;
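      // The three __kmp_forkjoin_frames_mode cases below report to ITT as
      // follows: case 1 submits a frame spanning th_frame_time..cur_time;
      // case 2 submits a frame starting at th_bar_min_time; case 3 also sums
      // each thread's wait time (cur_time - th_bar_arrive_time) and reports
      // it as imbalance metadata before submitting the frame.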
1380  switch (__kmp_forkjoin_frames_mode) {
1381  case 1:
1382  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1383  loc, nproc);
1384  this_thr->th.th_frame_time = cur_time;
1385  break;
1386  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1387  // be fixed)
1388  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1389  1, loc, nproc);
1390  break;
1391  case 3:
1392  if (__itt_metadata_add_ptr) {
1393  // Initialize with master's wait time
1394  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1395  // Set arrive time to zero to be able to check it in
1396  // __kmp_invoke_task(); the same is done inside the loop below
1397  this_thr->th.th_bar_arrive_time = 0;
1398  for (i = 1; i < nproc; ++i) {
1399  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1400  other_threads[i]->th.th_bar_arrive_time = 0;
1401  }
1402  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1403  cur_time, delta,
1404  (kmp_uint64)(reduce != NULL));
1405  }
1406  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1407  loc, nproc);
1408  this_thr->th.th_frame_time = cur_time;
1409  break;
1410  }
1411  }
1412 #endif /* USE_ITT_BUILD */
1413  } else {
1414  status = 1;
1415 #if USE_ITT_BUILD
1416  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1417  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1418 #endif /* USE_ITT_BUILD */
1419  }
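  // Workers (status == 1) always enter the release phase to wait for the go
  // signal; the master runs the release (signaling the workers) only when the
  // barrier is not split. For a split barrier the master returns now and the
  // release is performed later in __kmp_end_split_barrier().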
1420  if (status == 1 || !is_split) {
1421  switch (__kmp_barrier_release_pattern[bt]) {
1422  case bp_hyper_bar: {
1423  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1424  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1425  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1426  break;
1427  }
1428  case bp_hierarchical_bar: {
1429  __kmp_hierarchical_barrier_release(
1430  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1431  break;
1432  }
1433  case bp_tree_bar: {
1434  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1435  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1436  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1437  break;
1438  }
1439  default: {
1440  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1441  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1442  }
1443  }
1444  if (__kmp_tasking_mode != tskm_immediate_exec) {
1445  __kmp_task_team_sync(this_thr, team);
1446  }
1447  }
1448 
1449 #if USE_ITT_BUILD
1450  /* GEH: TODO: Move this under if-condition above and also include in
1451  __kmp_end_split_barrier(). This will more accurately represent the actual
1452  release time of the threads for split barriers. */
1453  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1454  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1455 #endif /* USE_ITT_BUILD */
1456  } else { // Team is serialized.
1457  status = 0;
1458  if (__kmp_tasking_mode != tskm_immediate_exec) {
1459 #if OMP_45_ENABLED
1460  if (this_thr->th.th_task_team != NULL) {
1461 #if USE_ITT_NOTIFY
1462  void *itt_sync_obj = NULL;
1463  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1464  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1465  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1466  }
1467 #endif
1468 
1469  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1470  TRUE);
1471  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1472  __kmp_task_team_setup(this_thr, team, 0);
1473 
1474 #if USE_ITT_BUILD
1475  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1476  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1477 #endif /* USE_ITT_BUILD */
1478  }
1479 #else
1480  // The task team should be NULL for serialized code (tasks will be
1481  // executed immediately)
1482  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1483  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1484 #endif
1485  }
1486  }
1487  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1488  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1489  __kmp_tid_from_gtid(gtid), status));
1490 
1491 #if OMPT_SUPPORT
1492  if (ompt_enabled.enabled) {
1493 #if OMPT_OPTIONAL
1494  if (ompt_enabled.ompt_callback_sync_region_wait) {
1495  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1496  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1497  my_task_data, return_address);
1498  }
1499  if (ompt_enabled.ompt_callback_sync_region) {
1500  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1501  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1502  my_task_data, return_address);
1503  }
1504 #endif
1505  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1506  }
1507 #endif
1508  ANNOTATE_BARRIER_END(&team->t.t_bar);
1509 
1510  return status;
1511 }
1512 
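/* __kmp_end_split_barrier() completes the release phase of a split barrier on
   behalf of the master thread; the workers already performed their wait inside
   __kmp_barrier() above. A rough usage sketch, assuming a reduction-style
   caller and illustrative reduce_size/reduce_data/reduce_func values (this is
   not the definitive call site, which lives in the compiler entry points):

     int status = __kmp_barrier(bs_reduction_barrier, gtid, TRUE, // is_split
                                reduce_size, reduce_data, reduce_func);
     if (status == 0) { // master: the release above was skipped
       // ... combine the partial reduction results ...
       __kmp_end_split_barrier(bs_reduction_barrier, gtid);
     }
*/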
1513 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1514  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1515  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1516  int tid = __kmp_tid_from_gtid(gtid);
1517  kmp_info_t *this_thr = __kmp_threads[gtid];
1518  kmp_team_t *team = this_thr->th.th_team;
1519 
1520  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1521  if (!team->t.t_serialized) {
1522  if (KMP_MASTER_GTID(gtid)) {
1523  switch (__kmp_barrier_release_pattern[bt]) {
1524  case bp_hyper_bar: {
1525  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1526  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1527  FALSE USE_ITT_BUILD_ARG(NULL));
1528  break;
1529  }
1530  case bp_hierarchical_bar: {
1531  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1532  FALSE USE_ITT_BUILD_ARG(NULL));
1533  break;
1534  }
1535  case bp_tree_bar: {
1536  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1537  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1538  FALSE USE_ITT_BUILD_ARG(NULL));
1539  break;
1540  }
1541  default: {
1542  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1543  FALSE USE_ITT_BUILD_ARG(NULL));
1544  }
1545  }
1546  if (__kmp_tasking_mode != tskm_immediate_exec) {
1547  __kmp_task_team_sync(this_thr, team);
1548  } // if
1549  }
1550  }
1551  ANNOTATE_BARRIER_END(&team->t.t_bar);
1552 }
1553 
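// The join barrier is executed by every thread of a team at the end of a
// parallel region. Only the gather phase runs in this function; the matching
// release for the workers happens later, in the fork barrier, once the master
// has prepared the next parallel region (or is shutting the team down).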
1554 void __kmp_join_barrier(int gtid) {
1555  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1556  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1557  kmp_info_t *this_thr = __kmp_threads[gtid];
1558  kmp_team_t *team;
1559  kmp_uint nproc;
1560  kmp_info_t *master_thread;
1561  int tid;
1562 #ifdef KMP_DEBUG
1563  int team_id;
1564 #endif /* KMP_DEBUG */
1565 #if USE_ITT_BUILD
1566  void *itt_sync_obj = NULL;
1567 #if USE_ITT_NOTIFY
1568  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless needed
1569  // Get object created at fork_barrier
1570  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1571 #endif
1572 #endif /* USE_ITT_BUILD */
1573  KMP_MB();
1574 
1575  // Get current info
1576  team = this_thr->th.th_team;
1577  nproc = this_thr->th.th_team_nproc;
1578  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1579  tid = __kmp_tid_from_gtid(gtid);
1580 #ifdef KMP_DEBUG
1581  team_id = team->t.t_id;
1582 #endif /* KMP_DEBUG */
1583  master_thread = this_thr->th.th_team_master;
1584 #ifdef KMP_DEBUG
1585  if (master_thread != team->t.t_threads[0]) {
1586  __kmp_print_structure();
1587  }
1588 #endif /* KMP_DEBUG */
1589  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1590  KMP_MB();
1591 
1592  // Verify state
1593  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1594  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1595  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1596  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1597  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1598  gtid, team_id, tid));
1599 
1600  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1601 #if OMPT_SUPPORT
1602  if (ompt_enabled.enabled) {
1603 #if OMPT_OPTIONAL
1604  ompt_data_t *my_task_data;
1605  ompt_data_t *my_parallel_data;
1606  void *codeptr = NULL;
1607  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1608  if (KMP_MASTER_TID(ds_tid) &&
1609  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1610  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1611  codeptr = team->t.ompt_team_info.master_return_address;
1612  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1613  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1614  if (ompt_enabled.ompt_callback_sync_region) {
1615  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1616  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1617  my_task_data, codeptr);
1618  }
1619  if (ompt_enabled.ompt_callback_sync_region_wait) {
1620  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1621  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1622  my_task_data, codeptr);
1623  }
1624  if (!KMP_MASTER_TID(ds_tid))
1625  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1626 #endif
1627  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1628  }
1629 #endif
1630 
1631  if (__kmp_tasking_mode == tskm_extra_barrier) {
1632  __kmp_tasking_barrier(team, this_thr, gtid);
1633  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1634  team_id, tid));
1635  }
1636 #ifdef KMP_DEBUG
1637  if (__kmp_tasking_mode != tskm_immediate_exec) {
1638  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1639  "%p, th_task_team = %p\n",
1640  __kmp_gtid_from_thread(this_thr), team_id,
1641  team->t.t_task_team[this_thr->th.th_task_state],
1642  this_thr->th.th_task_team));
1643  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1644  team->t.t_task_team[this_thr->th.th_task_state]);
1645  }
1646 #endif /* KMP_DEBUG */
1647 
1648  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1649  access it when the team struct is not guaranteed to exist. Doing these
1650  loads causes a cache miss that slows down EPCC parallel by 2x. As a
1651  workaround, we do not perform the copy if blocktime=infinite, since the
1652  values are not used by __kmp_wait_template() in that case. */
1653  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1654 #if KMP_USE_MONITOR
1655  this_thr->th.th_team_bt_intervals =
1656  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1657  this_thr->th.th_team_bt_set =
1658  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1659 #else
1660  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1661 #endif
1662  }
1663 
1664 #if USE_ITT_BUILD
1665  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1666  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1667 #endif /* USE_ITT_BUILD */
1668 
1669  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1670  case bp_hyper_bar: {
1671  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1672  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1673  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1674  break;
1675  }
1676  case bp_hierarchical_bar: {
1677  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1678  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1679  break;
1680  }
1681  case bp_tree_bar: {
1682  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1683  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1684  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1685  break;
1686  }
1687  default: {
1688  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1689  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1690  }
1691  }
1692 
1693  /* From this point on, the team data structure may be deallocated at any time
1694  by the master thread - it is unsafe to reference it in any of the worker
1695  threads. Any per-team data items that need to be referenced before the
1696  end of the barrier should be moved to the kmp_task_team_t structs. */
1697  if (KMP_MASTER_TID(tid)) {
1698  if (__kmp_tasking_mode != tskm_immediate_exec) {
1699  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1700  }
1701 #if OMP_50_ENABLED
1702  if (__kmp_display_affinity) {
1703  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1704  }
1705 #endif
1706 #if KMP_STATS_ENABLED
1707  // Have the master thread flag the workers to indicate they are now waiting
1708  // for the next parallel region, and wake them up so they switch their
1709  // timers to idle.
1710  for (int i = 0; i < team->t.t_nproc; ++i) {
1711  kmp_info_t *team_thread = team->t.t_threads[i];
1712  if (team_thread == this_thr)
1713  continue;
1714  team_thread->th.th_stats->setIdleFlag();
1715  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1716  team_thread->th.th_sleep_loc != NULL)
1717  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1718  team_thread->th.th_sleep_loc);
1719  }
1720 #endif
1721 #if USE_ITT_BUILD
1722  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1723  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1724 #endif /* USE_ITT_BUILD */
1725 
1726 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1727  // Join barrier - report frame end
1728  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1729  __kmp_forkjoin_frames_mode &&
1730 #if OMP_40_ENABLED
1731  this_thr->th.th_teams_microtask == NULL &&
1732 #endif
1733  team->t.t_active_level == 1) {
1734  kmp_uint64 cur_time = __itt_get_timestamp();
1735  ident_t *loc = team->t.t_ident;
1736  kmp_info_t **other_threads = team->t.t_threads;
1737  int nproc = this_thr->th.th_team_nproc;
1738  int i;
1739  switch (__kmp_forkjoin_frames_mode) {
1740  case 1:
1741  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1742  loc, nproc);
1743  break;
1744  case 2:
1745  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1746  loc, nproc);
1747  break;
1748  case 3:
1749  if (__itt_metadata_add_ptr) {
1750  // Initialize with master's wait time
1751  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1752  // Set arrive time to zero to be able to check it in
1753  // __kmp_invoke_task(); the same is done inside the loop below
1754  this_thr->th.th_bar_arrive_time = 0;
1755  for (i = 1; i < nproc; ++i) {
1756  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1757  other_threads[i]->th.th_bar_arrive_time = 0;
1758  }
1759  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1760  cur_time, delta, 0);
1761  }
1762  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1763  loc, nproc);
1764  this_thr->th.th_frame_time = cur_time;
1765  break;
1766  }
1767  }
1768 #endif /* USE_ITT_BUILD */
1769  }
1770 #if USE_ITT_BUILD
1771  else {
1772  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1773  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1774  }
1775 #endif /* USE_ITT_BUILD */
1776 
1777 #if KMP_DEBUG
1778  if (KMP_MASTER_TID(tid)) {
1779  KA_TRACE(
1780  15,
1781  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1782  gtid, team_id, tid, nproc));
1783  }
1784 #endif /* KMP_DEBUG */
1785 
1786  // TODO now, mark worker threads as done so they may be disbanded
1787  KMP_MB(); // Flush all pending memory write invalidates.
1788  KA_TRACE(10,
1789  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1790 
1791  ANNOTATE_BARRIER_END(&team->t.t_bar);
1792 }
1793 
1794 // TODO release worker threads' fork barriers as we are ready instead of all at
1795 // once
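// The fork barrier pairs with the join barrier above: the master sets up the
// task team, copies the blocktime info, and then runs the release phase, while
// workers block in the release phase until the next parallel region starts (or
// the runtime is shutting down).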
1796 void __kmp_fork_barrier(int gtid, int tid) {
1797  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1798  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1799  kmp_info_t *this_thr = __kmp_threads[gtid];
1800  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1801 #if USE_ITT_BUILD
1802  void *itt_sync_obj = NULL;
1803 #endif /* USE_ITT_BUILD */
1804  if (team)
1805  ANNOTATE_BARRIER_END(&team->t.t_bar);
1806 
1807  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1808  (team != NULL) ? team->t.t_id : -1, tid));
1809 
1810  // th_team pointer only valid for master thread here
1811  if (KMP_MASTER_TID(tid)) {
1812 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1813  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1814  // Create itt barrier object
1815  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1816  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1817  }
1818 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1819 
1820 #ifdef KMP_DEBUG
1821  kmp_info_t **other_threads = team->t.t_threads;
1822  int i;
1823 
1824  // Verify state
1825  KMP_MB();
1826 
1827  for (i = 1; i < team->t.t_nproc; ++i) {
1828  KA_TRACE(500,
1829  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1830  "== %u.\n",
1831  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1832  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1833  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1834  KMP_DEBUG_ASSERT(
1835  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1836  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1837  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1838  }
1839 #endif
1840 
1841  if (__kmp_tasking_mode != tskm_immediate_exec) {
1842  // 0 indicates setup current task team if nthreads > 1
1843  __kmp_task_team_setup(this_thr, team, 0);
1844  }
1845 
1846  /* The master thread may have changed its blocktime between the join barrier
1847  and the fork barrier. Copy the blocktime info to the thread, where
1848  __kmp_wait_template() can access it when the team struct is not
1849  guaranteed to exist. */
1850  // See note about the corresponding code in __kmp_join_barrier() being
1851  // performance-critical
1852  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1853 #if KMP_USE_MONITOR
1854  this_thr->th.th_team_bt_intervals =
1855  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1856  this_thr->th.th_team_bt_set =
1857  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1858 #else
1859  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1860 #endif
1861  }
1862  } // master
1863 
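  // Every thread runs the release algorithm: the master signals the go flags
  // and the workers wait on them. The final TRUE argument (the other barriers
  // in this file pass FALSE) requests that ICVs be propagated to the workers
  // as part of the release.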
1864  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1865  case bp_hyper_bar: {
1866  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1867  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1868  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1869  break;
1870  }
1871  case bp_hierarchical_bar: {
1872  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1873  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1874  break;
1875  }
1876  case bp_tree_bar: {
1877  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1878  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1879  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1880  break;
1881  }
1882  default: {
1883  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1884  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1885  }
1886  }
1887 
1888 #if OMPT_SUPPORT
1889  if (ompt_enabled.enabled &&
1890  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1891  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1892  ompt_data_t *task_data = (team)
1893  ? OMPT_CUR_TASK_DATA(this_thr)
1894  : &(this_thr->th.ompt_thread_info.task_data);
1895  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1896 #if OMPT_OPTIONAL
1897  void *codeptr = NULL;
1898  if (KMP_MASTER_TID(ds_tid) &&
1899  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1900  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1901  codeptr = team->t.ompt_team_info.master_return_address;
1902  if (ompt_enabled.ompt_callback_sync_region_wait) {
1903  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1904  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1905  }
1906  if (ompt_enabled.ompt_callback_sync_region) {
1907  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1908  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1909  }
1910 #endif
1911  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
1912  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1913  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1914  }
1915  }
1916 #endif
1917 
1918  // Early exit for reaping threads releasing forkjoin barrier
1919  if (TCR_4(__kmp_global.g.g_done)) {
1920  this_thr->th.th_task_team = NULL;
1921 
1922 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1923  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1924  if (!KMP_MASTER_TID(tid)) {
1925  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1926  if (itt_sync_obj)
1927  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1928  }
1929  }
1930 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1931  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1932  return;
1933  }
1934 
1935  /* We can now assume that a valid team structure has been allocated by the
1936  master and propagated to all worker threads. The current thread, however,
1937  may not be part of the team, so we can't blindly assume that the team
1938  pointer is non-null. */
1939  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1940  KMP_DEBUG_ASSERT(team != NULL);
1941  tid = __kmp_tid_from_gtid(gtid);
1942 
1943 #if KMP_BARRIER_ICV_PULL
1944  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1945  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1946  implicit task has this data before this function is called. We cannot
1947  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
1948  struct, because it is not always the case that the threads arrays have
1949  been allocated when __kmp_fork_call() is executed. */
1950  {
1951  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1952  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1953  // Copy the initial ICVs from the master's thread struct to the implicit
1954  // task for this tid.
1955  KA_TRACE(10,
1956  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1957  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
1958  tid, FALSE);
1959  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1960  &team->t.t_threads[0]
1961  ->th.th_bar[bs_forkjoin_barrier]
1962  .bb.th_fixed_icvs);
1963  }
1964  }
1965 #endif // KMP_BARRIER_ICV_PULL
1966 
1967  if (__kmp_tasking_mode != tskm_immediate_exec) {
1968  __kmp_task_team_sync(this_thr, team);
1969  }
1970 
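  // After the release, apply any affinity changes: if proc_bind is
  // proc_bind_intel, balanced affinity is recomputed when the team size
  // changed; for any other proc_bind policy except proc_bind_false, the thread
  // moves to its new place if it differs from its current one.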
1971 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1972  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1973  if (proc_bind == proc_bind_intel) {
1974 #endif
1975 #if KMP_AFFINITY_SUPPORTED
1976  // Call dynamic affinity settings
1977  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1978  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
1979  }
1980 #endif // KMP_AFFINITY_SUPPORTED
1981 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1982  } else if (proc_bind != proc_bind_false) {
1983  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1984  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1985  __kmp_gtid_from_thread(this_thr),
1986  this_thr->th.th_current_place));
1987  } else {
1988  __kmp_affinity_set_place(gtid);
1989  }
1990  }
1991 #endif
1992 #if OMP_50_ENABLED
1993  // Perform the display affinity functionality
1994  if (__kmp_display_affinity) {
1995  if (team->t.t_display_affinity
1996 #if KMP_AFFINITY_SUPPORTED
1997  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
1998 #endif
1999  ) {
2000  // NULL means use the affinity-format-var ICV
2001  __kmp_aux_display_affinity(gtid, NULL);
2002  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2003  this_thr->th.th_prev_level = team->t.t_level;
2004  }
2005  }
2006  if (!KMP_MASTER_TID(tid))
2007  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2008 #endif
2009 
2010 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2011  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2012  if (!KMP_MASTER_TID(tid)) {
2013  // Get correct barrier object
2014  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2015  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2016  } // (prepare called inside barrier_release)
2017  }
2018 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2019  ANNOTATE_BARRIER_END(&team->t.t_bar);
2020  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2021  team->t.t_id, tid));
2022 }
2023 
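// Prepares ICV propagation for a newly (re)formed team. Depending on the
// configured scheme, the new ICVs are either staged in the master's
// th_fixed_icvs for workers to pull after the fork barrier, left for the
// fork-barrier release to push, or copied linearly into every worker's
// implicit task right here.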
2024 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2025  kmp_internal_control_t *new_icvs, ident_t *loc) {
2026  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2027 
2028  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2029  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2030 
2031 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2032  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2033  implicit task has this data before this function is called. */
2034 #if KMP_BARRIER_ICV_PULL
2035  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (a copy
2036  that is not modified afterwards), where all of the worker threads can access
2037  them and make their own copies after the barrier. */
2038  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2039  // allocated at this point
2040  copy_icvs(
2041  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2042  new_icvs);
2043  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2044  team->t.t_threads[0], team));
2045 #elif KMP_BARRIER_ICV_PUSH
2046  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2047  // done here.
2048  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2049  team->t.t_threads[0], team));
2050 #else
2051  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2052  // time.
2053  ngo_load(new_icvs);
2054  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2055  // allocated at this point
2056  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2057  // TODO: GEH - pass in better source location info since usually NULL here
2058  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2059  f, team->t.t_threads[f], team));
2060  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2061  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2062  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2063  f, team->t.t_threads[f], team));
2064  }
2065  ngo_sync();
2066 #endif // KMP_BARRIER_ICV_PULL
2067 }