1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 #include "kmp_os.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #if KMP_MIC && USE_NGO_STORES
29 // ICV copying
30 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
31 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34 #else
35 #define ngo_load(src) ((void)0)
36 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
38 #define ngo_sync() ((void)0)
39 #endif /* KMP_MIC && USE_NGO_STORES */
40 
41 void __kmp_print_structure(void); // Forward declaration
42 
43 // ---------------------------- Barrier Algorithms ----------------------------
44 
45 // Linear Barrier
46 static void
47 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48  void (*reduce)(void *, void *)
49  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50 {
51  KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
52  register kmp_team_t *team = this_thr->th.th_team;
53  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54  register kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57  gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64  }
65 #endif
66  // We now perform a linear reduction to signal that all of the threads have arrived.
67  if (!KMP_MASTER_TID(tid)) {
68  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
69  " arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
70  __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71  thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72  // Mark arrival to master thread
73  /* After performing this write, a worker thread may not assume that the team is valid
74  any more - it could be deallocated by the master thread at any time. */
75  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76  flag.release();
77  } else {
78  register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79  register int nproc = this_thr->th.th_team_nproc;
80  register int i;
81  // Don't have to worry about sleep bit here or atomic since team setting
82  register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83 
84  // Collect all the worker team member threads.
85  for (i=1; i<nproc; ++i) {
86 #if KMP_CACHE_MANAGE
87  // Prefetch next thread's arrived count
88  if (i+1 < nproc)
89  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90 #endif /* KMP_CACHE_MANAGE */
91  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
93  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95 
96  // Wait for worker thread to arrive
97  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98  flag.wait(this_thr, FALSE
99  USE_ITT_BUILD_ARG(itt_sync_obj) );
100 #if USE_ITT_BUILD && USE_ITT_NOTIFY
101  // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102  if (__kmp_forkjoin_frames_mode == 2) {
103  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104  other_threads[i]->th.th_bar_min_time);
105  }
106 #endif
107  if (reduce) {
108  KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109  team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110  (*reduce)(this_thr->th.th_local.reduce_data,
111  other_threads[i]->th.th_local.reduce_data);
112  }
113  }
114  // Don't have to worry about sleep bit here or atomic since team setting
115  team_bar->b_arrived = new_state;
116  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
117  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118  }
119  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120  gtid, team->t.t_id, tid, bt));
121 }
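
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   The gather above has each worker publish its arrival on its own b_arrived flag while
   the master spins on every worker's flag in turn, folding reduce_data as each worker
   shows up. The standalone C++11 model below captures that shape with plain std::atomic
   counters; all names (toy_bar, toy_linear_gather, ...) are hypothetical and unrelated
   to the kmp_* APIs, and the sleep/suspend machinery of kmp_flag_64 is omitted. */
#if 0
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// One per-thread "arrived" flag, analogous to b_arrived (sketch only).
struct toy_bar { std::atomic<unsigned long long> arrived{0}; };

// Worker side of a linear gather: bump my own flag to the new epoch.
static void toy_linear_arrive(toy_bar &mine, unsigned long long new_epoch) {
    mine.arrived.store(new_epoch, std::memory_order_release);
}

// Master side: wait until every worker has published the new epoch,
// folding per-thread partial results as each one arrives.
static long toy_linear_gather(std::vector<toy_bar> &bars,
                              const std::vector<long> &partial,
                              unsigned long long new_epoch) {
    long sum = partial[0];                        // master's own contribution
    for (size_t i = 1; i < bars.size(); ++i) {
        while (bars[i].arrived.load(std::memory_order_acquire) < new_epoch)
            std::this_thread::yield();            // plain spin-wait, no sleeping
        sum += partial[i];                        // "reduce" as we go
    }
    return sum;
}

int main() {
    const int n = 4;
    std::vector<toy_bar> bars(n);
    std::vector<long> partial = {1, 2, 3, 4};
    std::vector<std::thread> workers;
    for (int i = 1; i < n; ++i)
        workers.emplace_back([&, i] { toy_linear_arrive(bars[i], 1); });
    long total = toy_linear_gather(bars, partial, 1);
    for (auto &w : workers) w.join();
    std::printf("reduced total = %ld\n", total);  // prints 10
    return 0;
}
#endif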
122 
123 static void
124 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
125  int propagate_icvs
126  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
127 {
128  KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
129  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
130  register kmp_team_t *team;
131 
132  if (KMP_MASTER_TID(tid)) {
133  register unsigned int i;
134  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
135  register kmp_info_t **other_threads;
136 
137  team = __kmp_threads[gtid]->th.th_team;
138  KMP_DEBUG_ASSERT(team != NULL);
139  other_threads = team->t.t_threads;
140 
141  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
142  gtid, team->t.t_id, tid, bt));
143 
144  if (nproc > 1) {
145 #if KMP_BARRIER_ICV_PUSH
146  {
147  KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
148  if (propagate_icvs) {
149  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
150  for (i=1; i<nproc; ++i) {
151  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
152  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
153  &team->t.t_implicit_task_taskdata[0].td_icvs);
154  }
155  ngo_sync();
156  }
157  }
158 #endif // KMP_BARRIER_ICV_PUSH
159 
160  // Now, release all of the worker threads
161  for (i=1; i<nproc; ++i) {
162 #if KMP_CACHE_MANAGE
163  // Prefetch next thread's go flag
164  if (i+1 < nproc)
165  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
166 #endif /* KMP_CACHE_MANAGE */
167  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
168  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
169  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
170  &other_threads[i]->th.th_bar[bt].bb.b_go,
171  other_threads[i]->th.th_bar[bt].bb.b_go,
172  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
173  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
174  flag.release();
175  }
176  }
177  } else { // Wait for the MASTER thread to release us
178  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
179  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
180  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
181  flag.wait(this_thr, TRUE
182  USE_ITT_BUILD_ARG(itt_sync_obj) );
183 #if USE_ITT_BUILD && USE_ITT_NOTIFY
184  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
185  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
186  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
187  // Cancel wait on previous parallel region...
188  __kmp_itt_task_starting(itt_sync_obj);
189 
190  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
191  return;
192 
193  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
194  if (itt_sync_obj != NULL)
195  // Call prepare as early as possible for "new" barrier
196  __kmp_itt_task_finished(itt_sync_obj);
197  } else
198 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
199  // Early exit for reaping threads releasing forkjoin barrier
200  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
201  return;
202  // The worker thread may now assume that the team is valid.
203 #ifdef KMP_DEBUG
204  tid = __kmp_tid_from_gtid(gtid);
205  team = __kmp_threads[gtid]->th.th_team;
206 #endif
207  KMP_DEBUG_ASSERT(team != NULL);
208  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
209  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
210  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
211  KMP_MB(); // Flush all pending memory write invalidates.
212  }
213  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
214  gtid, team->t.t_id, tid, bt));
215 }
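
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   The release above is the mirror image of the gather: the master stores a bump value
   into each worker's own b_go flag, and each worker spins on its own flag and then
   re-arms it for the next barrier. A minimal standalone version, assuming only C++11;
   the names below are hypothetical. */
#if 0
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Per-thread "go" flag, analogous to b_go above (sketch only).
struct toy_go { std::atomic<unsigned> go{0}; };

// Master side of a linear release: poke every worker's own flag.
static void toy_linear_release(std::vector<toy_go> &flags, unsigned bump) {
    for (size_t i = 1; i < flags.size(); ++i)
        flags[i].go.store(bump, std::memory_order_release);
}

// Worker side: wait on my own flag, then reset it for the next barrier,
// mirroring TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE) in the code above.
static void toy_linear_wait_go(toy_go &mine, unsigned bump) {
    while (mine.go.load(std::memory_order_acquire) != bump)
        std::this_thread::yield();
    mine.go.store(0, std::memory_order_relaxed);  // re-arm for the next epoch
}

int main() {
    const int n = 4;
    std::vector<toy_go> flags(n);
    std::vector<std::thread> workers;
    for (int i = 1; i < n; ++i)
        workers.emplace_back([&, i] {
            toy_linear_wait_go(flags[i], 1);
            std::printf("worker %d released\n", i);
        });
    toy_linear_release(flags, 1);                 // master releases everyone
    for (auto &w : workers) w.join();
    return 0;
}
#endif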
216 
217 // Tree barrier
218 static void
219 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
220  void (*reduce)(void *, void *)
221  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
222 {
223  KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
224  register kmp_team_t *team = this_thr->th.th_team;
225  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
226  register kmp_info_t **other_threads = team->t.t_threads;
227  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
228  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
229  register kmp_uint32 branch_factor = 1 << branch_bits;
230  register kmp_uint32 child;
231  register kmp_uint32 child_tid;
232  register kmp_uint64 new_state;
233 
234  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
235  gtid, team->t.t_id, tid, bt));
236  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
237 
238 #if USE_ITT_BUILD && USE_ITT_NOTIFY
239  // Barrier imbalance - save arrive time to the thread
240  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
241  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
242  }
243 #endif
244  // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
245  child_tid = (tid << branch_bits) + 1;
246  if (child_tid < nproc) {
247  // Parent threads wait for all their children to arrive
248  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
249  child = 1;
250  do {
251  register kmp_info_t *child_thr = other_threads[child_tid];
252  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
253 #if KMP_CACHE_MANAGE
254  // Prefetch next thread's arrived count
255  if (child+1 <= branch_factor && child_tid+1 < nproc)
256  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
257 #endif /* KMP_CACHE_MANAGE */
258  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
259  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
260  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
261  &child_bar->b_arrived, new_state));
262  // Wait for child to arrive
263  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
264  flag.wait(this_thr, FALSE
265  USE_ITT_BUILD_ARG(itt_sync_obj) );
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267  // Barrier imbalance - write min of the thread time and a child time to the thread.
268  if (__kmp_forkjoin_frames_mode == 2) {
269  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
270  child_thr->th.th_bar_min_time);
271  }
272 #endif
273  if (reduce) {
274  KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
275  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
276  team->t.t_id, child_tid));
277  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
278  }
279  child++;
280  child_tid++;
281  }
282  while (child <= branch_factor && child_tid < nproc);
283  }
284 
285  if (!KMP_MASTER_TID(tid)) { // Worker threads
286  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
287 
288  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
289  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
290  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
291  &thr_bar->b_arrived, thr_bar->b_arrived,
292  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
293 
294  // Mark arrival to parent thread
295  /* After performing this write, a worker thread may not assume that the team is valid
296  any more - it could be deallocated by the master thread at any time. */
297  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
298  flag.release();
299  } else {
300  // Need to update the team arrived pointer if we are the master thread
301  if (nproc > 1) // New value was already computed above
302  team->t.t_bar[bt].b_arrived = new_state;
303  else
304  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
305  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
306  gtid, team->t.t_id, tid, team->t.t_id,
307  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
308  }
309  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
310  gtid, team->t.t_id, tid, bt));
311 }
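
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   The tree gather wires the team into a branch_factor-ary tree purely by tid arithmetic:
   the children of tid are (tid << branch_bits) + 1 .. + branch_factor (clipped to nproc),
   and a worker reports to parent (tid - 1) >> branch_bits. The tiny standalone program
   below just prints that wiring so it can be checked by eye; no threading involved. */
#if 0
#include <cstdio>

int main() {
    const unsigned branch_bits = 2;               // branch_factor = 4
    const unsigned branch_factor = 1u << branch_bits;
    const unsigned nproc = 10;

    for (unsigned tid = 0; tid < nproc; ++tid) {
        if (tid != 0)
            std::printf("tid %u -> parent %u\n", tid, (tid - 1) >> branch_bits);
        unsigned first = (tid << branch_bits) + 1; // first child, as in the gather above
        for (unsigned c = 0; c < branch_factor && first + c < nproc; ++c)
            std::printf("tid %u waits on child %u\n", tid, first + c);
    }
    // With branch_bits == 2 and nproc == 10: tid 0 waits on 1..4, tid 1 on 5..8,
    // tid 2 on 9; every other thread is a leaf.
    return 0;
}
#endif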
312 
313 static void
314 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
315  int propagate_icvs
316  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
317 {
318  KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
319  register kmp_team_t *team;
320  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
321  register kmp_uint32 nproc;
322  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
323  register kmp_uint32 branch_factor = 1 << branch_bits;
324  register kmp_uint32 child;
325  register kmp_uint32 child_tid;
326 
327  // Perform a tree release for all of the threads that have been gathered
328  if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
329  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
330  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
331  // Wait for parent thread to release us
332  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
333  flag.wait(this_thr, TRUE
334  USE_ITT_BUILD_ARG(itt_sync_obj) );
335 #if USE_ITT_BUILD && USE_ITT_NOTIFY
336  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
337  // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
338  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
339  // Cancel wait on previous parallel region...
340  __kmp_itt_task_starting(itt_sync_obj);
341 
342  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
343  return;
344 
345  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
346  if (itt_sync_obj != NULL)
347  // Call prepare as early as possible for "new" barrier
348  __kmp_itt_task_finished(itt_sync_obj);
349  } else
350 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
351  // Early exit for reaping threads releasing forkjoin barrier
352  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
353  return;
354 
355  // The worker thread may now assume that the team is valid.
356  team = __kmp_threads[gtid]->th.th_team;
357  KMP_DEBUG_ASSERT(team != NULL);
358  tid = __kmp_tid_from_gtid(gtid);
359 
360  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
361  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
362  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
363  KMP_MB(); // Flush all pending memory write invalidates.
364  } else {
365  team = __kmp_threads[gtid]->th.th_team;
366  KMP_DEBUG_ASSERT(team != NULL);
367  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
368  gtid, team->t.t_id, tid, bt));
369  }
370  nproc = this_thr->th.th_team_nproc;
371  child_tid = (tid << branch_bits) + 1;
372 
373  if (child_tid < nproc) {
374  register kmp_info_t **other_threads = team->t.t_threads;
375  child = 1;
376  // Parent threads release all their children
377  do {
378  register kmp_info_t *child_thr = other_threads[child_tid];
379  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
380 #if KMP_CACHE_MANAGE
381  // Prefetch next thread's go count
382  if (child+1 <= branch_factor && child_tid+1 < nproc)
383  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
384 #endif /* KMP_CACHE_MANAGE */
385 
386 #if KMP_BARRIER_ICV_PUSH
387  {
388  KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
389  if (propagate_icvs) {
390  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
391  team, child_tid, FALSE);
392  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
393  &team->t.t_implicit_task_taskdata[0].td_icvs);
394  }
395  }
396 #endif // KMP_BARRIER_ICV_PUSH
397  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
398  " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
399  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
400  child_tid, &child_bar->b_go, child_bar->b_go,
401  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
402  // Release child from barrier
403  kmp_flag_64 flag(&child_bar->b_go, child_thr);
404  flag.release();
405  child++;
406  child_tid++;
407  }
408  while (child <= branch_factor && child_tid < nproc);
409  }
410  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
411  gtid, team->t.t_id, tid, bt));
412 }
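
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   The release walks the same tree top-down: once a thread's own b_go fires, it fires the
   b_go of each of its children, so the whole team is woken in O(log nproc) waves. The
   standalone C++11 sketch below reproduces that wave with std::atomic go flags and the
   same tid arithmetic; names are hypothetical and the ICV copying is left out. */
#if 0
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Top-down tree release: after my own go flag fires, I fire my children's.
static void toy_tree_release_children(std::vector<std::atomic<int>> &go,
                                      unsigned tid, unsigned branch_bits,
                                      unsigned nproc) {
    unsigned branch_factor = 1u << branch_bits;
    unsigned first = (tid << branch_bits) + 1;    // same child arithmetic as above
    for (unsigned c = 0; c < branch_factor && first + c < nproc; ++c)
        go[first + c].store(1, std::memory_order_release);
}

int main() {
    const unsigned branch_bits = 1, nproc = 8;    // a binary tree of 8 threads
    std::vector<std::atomic<int>> go(nproc);
    for (auto &g : go) g.store(0);

    std::vector<std::thread> workers;
    for (unsigned tid = 1; tid < nproc; ++tid)
        workers.emplace_back([&, tid] {
            while (go[tid].load(std::memory_order_acquire) == 0)
                std::this_thread::yield();        // wait for my parent's release
            toy_tree_release_children(go, tid, branch_bits, nproc);
            std::printf("tid %u released\n", tid);
        });

    toy_tree_release_children(go, 0, branch_bits, nproc);  // master starts the wave
    for (auto &w : workers) w.join();
    return 0;
}
#endif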
413 
414 
415 // Hyper Barrier
416 static void
417 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
418  void (*reduce)(void *, void *)
419  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
420 {
421  KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
422  register kmp_team_t *team = this_thr->th.th_team;
423  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
424  register kmp_info_t **other_threads = team->t.t_threads;
425  register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
426  register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
427  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
428  register kmp_uint32 branch_factor = 1 << branch_bits;
429  register kmp_uint32 offset;
430  register kmp_uint32 level;
431 
432  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
433  gtid, team->t.t_id, tid, bt));
434 
435  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
436 
437 #if USE_ITT_BUILD && USE_ITT_NOTIFY
438  // Barrier imbalance - save arrive time to the thread
439  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
440  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
441  }
442 #endif
443  /* Perform a hypercube-embedded tree gather to wait until all of the threads have
444  arrived, and reduce any required data as we go. */
445  kmp_flag_64 p_flag(&thr_bar->b_arrived);
446  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
447  {
448  register kmp_uint32 child;
449  register kmp_uint32 child_tid;
450 
451  if (((tid >> level) & (branch_factor - 1)) != 0) {
452  register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
453 
454  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
455  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
456  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
457  &thr_bar->b_arrived, thr_bar->b_arrived,
458  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
459  // Mark arrival to parent thread
460  /* After performing this write (in the last iteration of the enclosing for loop),
461  a worker thread may not assume that the team is valid any more - it could be
462  deallocated by the master thread at any time. */
463  p_flag.set_waiter(other_threads[parent_tid]);
464  p_flag.release();
465  break;
466  }
467 
468  // Parent threads wait for children to arrive
469  if (new_state == KMP_BARRIER_UNUSED_STATE)
470  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
471  for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
472  child++, child_tid+=(1 << level))
473  {
474  register kmp_info_t *child_thr = other_threads[child_tid];
475  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
476 #if KMP_CACHE_MANAGE
477  register kmp_uint32 next_child_tid = child_tid + (1 << level);
478  // Prefetch next thread's arrived count
479  if (child+1 < branch_factor && next_child_tid < num_threads)
480  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
481 #endif /* KMP_CACHE_MANAGE */
482  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
483  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
484  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
485  &child_bar->b_arrived, new_state));
486  // Wait for child to arrive
487  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
488  c_flag.wait(this_thr, FALSE
489  USE_ITT_BUILD_ARG(itt_sync_obj) );
490 #if USE_ITT_BUILD && USE_ITT_NOTIFY
491  // Barrier imbalance - write min of the thread time and a child time to the thread.
492  if (__kmp_forkjoin_frames_mode == 2) {
493  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
494  child_thr->th.th_bar_min_time);
495  }
496 #endif
497  if (reduce) {
498  KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
499  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
500  team->t.t_id, child_tid));
501  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
502  }
503  }
504  }
505 
506  if (KMP_MASTER_TID(tid)) {
507  // Need to update the team arrived pointer if we are the master thread
508  if (new_state == KMP_BARRIER_UNUSED_STATE)
509  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
510  else
511  team->t.t_bar[bt].b_arrived = new_state;
512  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
513  gtid, team->t.t_id, tid, team->t.t_id,
514  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
515  }
516  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
517  gtid, team->t.t_id, tid, bt));
518 }
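
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   In the hypercube-embedded gather, a thread climbs levels of branch_bits at a time: at
   the first level where its tid has a non-zero digit it signals the parent
   tid & ~((1 << (level + branch_bits)) - 1) and stops; at lower levels it collects the
   children tid + (1 << level), tid + 2*(1 << level), ... The standalone program below
   prints that pairing for a small team so the index arithmetic can be checked by eye. */
#if 0
#include <cstdio>

int main() {
    const unsigned branch_bits = 1;               // branch_factor = 2: classic hypercube
    const unsigned branch_factor = 1u << branch_bits;
    const unsigned num_threads = 8;

    for (unsigned tid = 0; tid < num_threads; ++tid) {
        for (unsigned level = 0, offset = 1; offset < num_threads;
             level += branch_bits, offset <<= branch_bits) {
            if (((tid >> level) & (branch_factor - 1)) != 0) {
                unsigned parent = tid & ~((1u << (level + branch_bits)) - 1);
                std::printf("tid %u reports to %u at level %u\n", tid, parent, level);
                break;                            // done once the parent is signalled
            }
            for (unsigned child = 1, child_tid = tid + (1u << level);
                 child < branch_factor && child_tid < num_threads;
                 ++child, child_tid += (1u << level))
                std::printf("tid %u gathers child %u at level %u\n",
                            tid, child_tid, level);
        }
    }
    // For 8 threads and branch_factor 2: 1->0, 3->2, 5->4, 7->6 at level 0;
    // 2->0 and 6->4 at level 1; 4->0 at level 2.
    return 0;
}
#endif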
519 
520 // The reverse versions seem to beat the forward versions overall
521 #define KMP_REVERSE_HYPER_BAR
522 static void
523 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
524  int propagate_icvs
525  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
526 {
527  KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
528  register kmp_team_t *team;
529  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
530  register kmp_info_t **other_threads;
531  register kmp_uint32 num_threads;
532  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
533  register kmp_uint32 branch_factor = 1 << branch_bits;
534  register kmp_uint32 child;
535  register kmp_uint32 child_tid;
536  register kmp_uint32 offset;
537  register kmp_uint32 level;
538 
539  /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
540  If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
541  order of the corresponding gather, otherwise threads are released in the same order. */
542  if (KMP_MASTER_TID(tid)) { // master
543  team = __kmp_threads[gtid]->th.th_team;
544  KMP_DEBUG_ASSERT(team != NULL);
545  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
546  gtid, team->t.t_id, tid, bt));
547 #if KMP_BARRIER_ICV_PUSH
548  if (propagate_icvs) { // master already has ICVs in final destination; copy
549  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
550  }
551 #endif
552  }
553  else { // Handle fork barrier workers who aren't part of a team yet
554  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
555  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
556  // Wait for parent thread to release us
557  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
558  flag.wait(this_thr, TRUE
559  USE_ITT_BUILD_ARG(itt_sync_obj) );
560 #if USE_ITT_BUILD && USE_ITT_NOTIFY
561  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
562  // In fork barrier where we could not get the object reliably
563  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
564  // Cancel wait on previous parallel region...
565  __kmp_itt_task_starting(itt_sync_obj);
566 
567  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
568  return;
569 
570  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
571  if (itt_sync_obj != NULL)
572  // Call prepare as early as possible for "new" barrier
573  __kmp_itt_task_finished(itt_sync_obj);
574  } else
575 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
576  // Early exit for reaping threads releasing forkjoin barrier
577  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
578  return;
579 
580  // The worker thread may now assume that the team is valid.
581  team = __kmp_threads[gtid]->th.th_team;
582  KMP_DEBUG_ASSERT(team != NULL);
583  tid = __kmp_tid_from_gtid(gtid);
584 
585  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
586  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
587  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
588  KMP_MB(); // Flush all pending memory write invalidates.
589  }
590  num_threads = this_thr->th.th_team_nproc;
591  other_threads = team->t.t_threads;
592 
593 #ifdef KMP_REVERSE_HYPER_BAR
594  // Count up to correct level for parent
595  for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
596  level+=branch_bits, offset<<=branch_bits);
597 
598  // Now go down from there
599  for (level-=branch_bits, offset>>=branch_bits; offset != 0;
600  level-=branch_bits, offset>>=branch_bits)
601 #else
602  // Go down the tree, level by level
603  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
604 #endif // KMP_REVERSE_HYPER_BAR
605  {
606 #ifdef KMP_REVERSE_HYPER_BAR
607  /* Now go in reverse order through the children, highest to lowest.
608  Initial setting of child is conservative here. */
609  child = num_threads >> ((level==0)?level:level-1);
610  for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
611  child>=1; child--, child_tid-=(1<<level))
612 #else
613  if (((tid >> level) & (branch_factor - 1)) != 0)
614  // No need to go lower than this, since this is the level at which the parent would be notified
615  break;
616  // Iterate through children on this level of the tree
617  for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
618  child++, child_tid+=(1<<level))
619 #endif // KMP_REVERSE_HYPER_BAR
620  {
621  if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
622  else {
623  register kmp_info_t *child_thr = other_threads[child_tid];
624  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
625 #if KMP_CACHE_MANAGE
626  register kmp_uint32 next_child_tid = child_tid - (1 << level);
627  // Prefetch next thread's go count
628 # ifdef KMP_REVERSE_HYPER_BAR
629  if (child-1 >= 1 && next_child_tid < num_threads)
630 # else
631  if (child+1 < branch_factor && next_child_tid < num_threads)
632 # endif // KMP_REVERSE_HYPER_BAR
633  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
634 #endif /* KMP_CACHE_MANAGE */
635 
636 #if KMP_BARRIER_ICV_PUSH
637  if (propagate_icvs) // push my fixed ICVs to my child
638  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
639 #endif // KMP_BARRIER_ICV_PUSH
640 
641  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
642  " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
643  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
644  child_tid, &child_bar->b_go, child_bar->b_go,
645  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
646  // Release child from barrier
647  kmp_flag_64 flag(&child_bar->b_go, child_thr);
648  flag.release();
649  }
650  }
651  }
652 #if KMP_BARRIER_ICV_PUSH
653  if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
654  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
655  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
656  }
657 #endif
658  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
659  gtid, team->t.t_id, tid, bt));
660 }
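
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   With KMP_REVERSE_HYPER_BAR the release visits the same children as the gather but in
   the opposite order: highest level first, and higher-numbered children before lower
   ones within a level. The standalone program below prints that order in a simplified
   (but, for the released set and ordering, equivalent) form; it does not reproduce the
   conservative initial-child clipping used by the loop above. */
#if 0
#include <cstdio>

int main() {
    const unsigned branch_bits = 1;               // branch_factor = 2
    const unsigned branch_factor = 1u << branch_bits;
    const unsigned num_threads = 8;

    for (unsigned tid = 0; tid < num_threads; ++tid) {
        // How far up is this tid a parent? Same test as the gather loop.
        unsigned top = 0;
        for (unsigned level = 0, offset = 1;
             offset < num_threads && ((tid >> level) & (branch_factor - 1)) == 0;
             level += branch_bits, offset <<= branch_bits)
            top = level + branch_bits;

        // Walk the parent levels downward, highest first (the "reverse" order).
        for (unsigned level = top; level-- > 0; ) {
            if (level % branch_bits) continue;    // only multiples of branch_bits are levels
            for (unsigned k = branch_factor - 1; k >= 1; --k) {
                unsigned child_tid = tid + (k << level);
                if (child_tid < num_threads)
                    std::printf("tid %u releases %u (level %u)\n", tid, child_tid, level);
            }
        }
    }
    // With 8 threads and branch_factor 2: tid 0 releases 4, then 2, then 1;
    // tid 4 releases 6 then 5; tids 2 and 6 release 3 and 7 respectively.
    return 0;
}
#endif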
661 
662 // Hierarchical Barrier
663 
664 // Initialize thread barrier data
665 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
666  minimum amount of initialization required based on how the team has changed. Returns true if
667  leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
668  team size increases, threads already in the team will respond to on-core wakeup on their parent
669  thread, but threads newly added to the team will only be listening on their local b_go. */
670 static bool
671 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
672  int gtid, int tid, kmp_team_t *team)
673 {
674  // Checks to determine if (re-)initialization is needed
675  bool uninitialized = thr_bar->team == NULL;
676  bool team_changed = team != thr_bar->team;
677  bool team_sz_changed = nproc != thr_bar->nproc;
678  bool tid_changed = tid != thr_bar->old_tid;
679  bool retval = false;
680 
681  if (uninitialized || team_sz_changed) {
682  __kmp_get_hierarchy(nproc, thr_bar);
683  }
684 
685  if (uninitialized || team_sz_changed || tid_changed) {
686  thr_bar->my_level = thr_bar->depth-1; // default for master
687  thr_bar->parent_tid = -1; // default for master
688  if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
689  kmp_uint32 d=0;
690  while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
691  kmp_uint32 rem;
692  if (d == thr_bar->depth-2) { // reached level right below the master
693  thr_bar->parent_tid = 0;
694  thr_bar->my_level = d;
695  break;
696  }
697  else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
698  // thread is not a subtree root at next level, so this is max
699  thr_bar->parent_tid = tid - rem;
700  thr_bar->my_level = d;
701  break;
702  }
703  ++d;
704  }
705  }
706  thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
707  thr_bar->old_tid = tid;
708  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
709  thr_bar->team = team;
710  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711  }
712  if (uninitialized || team_changed || tid_changed) {
713  thr_bar->team = team;
714  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
715  retval = true;
716  }
717  if (uninitialized || team_sz_changed || tid_changed) {
718  thr_bar->nproc = nproc;
719  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
720  if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
721  if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
722  thr_bar->leaf_kids = nproc - tid - 1;
723  thr_bar->leaf_state = 0;
724  for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
725  }
726  return retval;
727 }
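
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   The leaf_state trick set up above assigns each of up to 8 leaf children one byte
   inside the parent's 64-bit flag word (byte 7-i), so a leaf can check in with a single
   byte write and the parent can test for "all leaves arrived" with one 64-bit compare
   and clear them with one AND. The standalone program below builds such a mask the same
   way and exercises it; the check used here is a simplified (arrived & mask) == mask
   test rather than the exact flag-value comparison the runtime performs. */
#if 0
#include <cstdio>
#include <cstdint>

// Build a leaf_state mask the way the initializer above does: one byte per
// leaf child, filled from the high end of the 64-bit word.
static uint64_t toy_leaf_state(int leaf_kids) {
    uint64_t state = 0;
    for (int i = 0; i < leaf_kids; ++i)
        ((unsigned char *)&state)[7 - i] = 1;
    return state;
}

int main() {
    const int leaf_kids = 3;
    uint64_t wanted = toy_leaf_state(leaf_kids);

    // Simulate three leaves "checking in" by setting their private byte.
    uint64_t arrived = 0;
    for (int i = 0; i < leaf_kids; ++i) {
        ((unsigned char *)&arrived)[7 - i] = 1;
        std::printf("after leaf %d: arrived=0x%016llx, all-in=%s\n", i,
                    (unsigned long long)arrived,
                    (arrived & wanted) == wanted ? "yes" : "no");
    }
    // Clearing every leaf's byte is a single AND with the complement of the mask.
    arrived &= ~wanted;
    std::printf("after clear: arrived=0x%016llx\n", (unsigned long long)arrived);
    return 0;
}
#endif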
728 
729 static void
730 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
731  int gtid, int tid, void (*reduce) (void *, void *)
732  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
733 {
734  KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
735  register kmp_team_t *team = this_thr->th.th_team;
736  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
737  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
738  register kmp_info_t **other_threads = team->t.t_threads;
739  register kmp_uint64 new_state;
740 
741  int level = team->t.t_level;
742 #if OMP_40_ENABLED
743  if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
744  if (this_thr->th.th_teams_size.nteams > 1)
745  ++level; // level was not increased in teams construct for team_of_masters
746 #endif
747  if (level == 1) thr_bar->use_oncore_barrier = 1;
748  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
749 
750  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
751  gtid, team->t.t_id, tid, bt));
752  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
753 
754 #if USE_ITT_BUILD && USE_ITT_NOTIFY
755  // Barrier imbalance - save arrive time to the thread
756  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
757  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
758  }
759 #endif
760 
761  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
762 
763  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
764  register kmp_int32 child_tid;
765  new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
766  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
767  if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
768  kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
769  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
770  gtid, team->t.t_id, tid));
771  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
772  flag.wait(this_thr, FALSE
773  USE_ITT_BUILD_ARG(itt_sync_obj) );
774  if (reduce) {
775  for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
776  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
777  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
778  team->t.t_id, child_tid));
779  (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
780  }
781  }
782  (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
783  }
784  // Next, wait for higher level children on each child's b_arrived flag
785  for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
786  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
787  if (last > nproc) last = nproc;
788  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
789  register kmp_info_t *child_thr = other_threads[child_tid];
790  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
791  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
792  "arrived(%p) == %llu\n",
793  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
794  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
795  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
796  flag.wait(this_thr, FALSE
797  USE_ITT_BUILD_ARG(itt_sync_obj) );
798  if (reduce) {
799  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
800  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
801  team->t.t_id, child_tid));
802  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
803  }
804  }
805  }
806  }
807  else { // Blocktime is not infinite
808  for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
809  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
810  if (last > nproc) last = nproc;
811  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
812  register kmp_info_t *child_thr = other_threads[child_tid];
813  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
814  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
815  "arrived(%p) == %llu\n",
816  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
817  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
818  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
819  flag.wait(this_thr, FALSE
820  USE_ITT_BUILD_ARG(itt_sync_obj) );
821  if (reduce) {
822  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
823  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
824  team->t.t_id, child_tid));
825  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
826  }
827  }
828  }
829  }
830  }
831  // All subordinates are gathered; now release parent if not master thread
832 
833  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
834  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
835  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
836  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
837  &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
838  /* Mark arrival to parent: After performing this write, a worker thread may not assume that
839  the team is valid any more - it could be deallocated by the master thread at any time. */
840  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
841  || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
842  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
843  flag.release();
844  }
845  else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
846  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
847  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
848  flag.set_waiter(other_threads[thr_bar->parent_tid]);
849  flag.release();
850  }
851  } else { // Master thread needs to update the team's b_arrived value
852  team->t.t_bar[bt].b_arrived = new_state;
853  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
854  gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
855  }
856  // Is the team access below unsafe or just technically invalid?
857  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
858  gtid, team->t.t_id, tid, bt));
859 }
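
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   On the infinite-blocktime fast path above, all leaf children check in on the parent's
   single b_arrived word and the parent then clears the leaf bits with one atomic AND
   (KMP_TEST_THEN_AND64). The standalone C++11 sketch below models that wait/clear step
   with std::atomic; for simplicity the leaves use fetch_or to set their byte, whereas
   the runtime's leaves store a whole flag value, and all names are hypothetical. */
#if 0
#include <atomic>
#include <cstdio>
#include <cstdint>
#include <thread>
#include <vector>

int main() {
    // Parent's 64-bit arrived word; leaf i owns byte (7 - i) of it.
    std::atomic<uint64_t> b_arrived{0};
    const int leaf_kids = 3;

    uint64_t leaf_state = 0;
    for (int i = 0; i < leaf_kids; ++i)
        leaf_state |= (uint64_t)1 << (8 * (7 - i));   // same byte layout as the init code

    std::vector<std::thread> leaves;
    for (int i = 0; i < leaf_kids; ++i)
        leaves.emplace_back([&, i] {
            // A leaf checks in by setting its private byte of the parent's word.
            b_arrived.fetch_or((uint64_t)1 << (8 * (7 - i)), std::memory_order_release);
        });

    // Parent waits until every leaf byte is set (a single 64-bit test)...
    while ((b_arrived.load(std::memory_order_acquire) & leaf_state) != leaf_state)
        std::this_thread::yield();
    // ...then clears the leaf bits in one shot, like KMP_TEST_THEN_AND64 above.
    b_arrived.fetch_and(~leaf_state, std::memory_order_acq_rel);

    for (auto &t : leaves) t.join();
    std::printf("b_arrived after clear = 0x%016llx\n",
                (unsigned long long)b_arrived.load());
    return 0;
}
#endif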
860 
861 static void
862 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863  int propagate_icvs
864  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
865 {
866  KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
867  register kmp_team_t *team;
868  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
869  register kmp_uint32 nproc;
870  bool team_change = false; // indicates on-core barrier shouldn't be used
871 
872  if (KMP_MASTER_TID(tid)) {
873  team = __kmp_threads[gtid]->th.th_team;
874  KMP_DEBUG_ASSERT(team != NULL);
875  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
876  gtid, team->t.t_id, tid, bt));
877  }
878  else { // Worker threads
879  // Wait for parent thread to release me
880  if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
881  || thr_bar->my_level != 0 || thr_bar->team == NULL) {
882  // Use traditional method of waiting on my own b_go flag
883  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
884  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
885  flag.wait(this_thr, TRUE
886  USE_ITT_BUILD_ARG(itt_sync_obj) );
887  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
888  }
889  else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
890  // Wait on my "offset" bits on parent's b_go flag
891  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
892  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
893  bt, this_thr
894  USE_ITT_BUILD_ARG(itt_sync_obj) );
895  flag.wait(this_thr, TRUE);
896  if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
897  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
898  }
899  else { // Reset my bits on parent's b_go flag
900  ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
901  }
902  }
903  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
904  // Early exit for reaping threads releasing forkjoin barrier
905  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
906  return;
907  // The worker thread may now assume that the team is valid.
908  team = __kmp_threads[gtid]->th.th_team;
909  KMP_DEBUG_ASSERT(team != NULL);
910  tid = __kmp_tid_from_gtid(gtid);
911 
912  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
913  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
914  KMP_MB(); // Flush all pending memory write invalidates.
915  }
916 
917  nproc = this_thr->th.th_team_nproc;
918  int level = team->t.t_level;
919 #if OMP_40_ENABLED
920  if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
921  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
922  ++level; // level was not increased in teams construct for team_of_workers
923  if( this_thr->th.th_teams_size.nteams > 1 )
924  ++level; // level was not increased in teams construct for team_of_masters
925  }
926 #endif
927  if (level == 1) thr_bar->use_oncore_barrier = 1;
928  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
929 
930  // If the team size has increased, we still communicate with old leaves via oncore barrier.
931  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
932  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
933  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
934  // But if the entire team changes, we won't use oncore barrier at all
935  if (team_change) old_leaf_kids = 0;
936 
937 #if KMP_BARRIER_ICV_PUSH
938  if (propagate_icvs) {
939  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
940  if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
941  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
942  }
943  else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
944  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
945  // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
946  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
947  &thr_bar->parent_bar->th_fixed_icvs);
948  // non-leaves will get ICVs piggybacked with b_go via NGO store
949  }
950  else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
951  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
952  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
953  else // leaves copy parent's fixed ICVs directly to local ICV store
954  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
955  &thr_bar->parent_bar->th_fixed_icvs);
956  }
957  }
958 #endif // KMP_BARRIER_ICV_PUSH
959 
960  // Now, release my children
961  if (thr_bar->my_level) { // not a leaf
962  register kmp_int32 child_tid;
963  kmp_uint32 last;
964  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
965  if (KMP_MASTER_TID(tid)) { // do a flat release
966  // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
967  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
968  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
969  ngo_load(&thr_bar->th_fixed_icvs);
970  // This loops over all the threads skipping only the leaf nodes in the hierarchy
971  for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
972  register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
973  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
974  " go(%p): %u => %u\n",
975  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
976  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
977  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
978  // Use ngo store (if available) to both store ICVs and release child via child's b_go
979  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
980  }
981  ngo_sync();
982  }
983  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
984  // Now, release leaf children
985  if (thr_bar->leaf_kids) { // if there are any
986  // We test team_change on the off-chance that the level 1 team changed.
987  if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
988  if (old_leaf_kids) { // release old leaf kids
989  thr_bar->b_go |= old_leaf_state;
990  }
991  // Release new leaf kids
992  last = tid+thr_bar->skip_per_level[1];
993  if (last > nproc) last = nproc;
994  for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
995  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
996  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
997  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
998  " T#%d(%d:%d) go(%p): %u => %u\n",
999  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1000  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1001  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1002  // Release child using child's b_go flag
1003  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1004  flag.release();
1005  }
1006  }
1007  else { // Release all children at once with leaf_state bits on my own b_go flag
1008  thr_bar->b_go |= thr_bar->leaf_state;
1009  }
1010  }
1011  }
1012  else { // Blocktime is not infinite; do a simple hierarchical release
1013  for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1014  last = tid+thr_bar->skip_per_level[d+1];
1015  kmp_uint32 skip = thr_bar->skip_per_level[d];
1016  if (last > nproc) last = nproc;
1017  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1018  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1019  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1020  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1021  " go(%p): %u => %u\n",
1022  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1023  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1024  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1025  // Release child using child's b_go flag
1026  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1027  flag.release();
1028  }
1029  }
1030  }
1031 #if KMP_BARRIER_ICV_PUSH
1032  if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1033  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1034 #endif // KMP_BARRIER_ICV_PUSH
1035  }
1036  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1037  gtid, team->t.t_id, tid, bt));
1038 }
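
/* [Illustrative sketch; not part of the runtime, kept out of the build with #if 0.]
   The "release all leaf children at once" branch above works because each on-core leaf
   spins on its own byte of the parent's b_go word, so OR-ing leaf_state into b_go wakes
   every leaf with a single store. A minimal standalone model of that wake-up, assuming
   only C++11; names are hypothetical and the ICV/NGO-store piggybacking is omitted. */
#if 0
#include <atomic>
#include <cstdio>
#include <cstdint>
#include <thread>
#include <vector>

int main() {
    // Parent's b_go word; leaf i watches byte (7 - i) of it.
    std::atomic<uint64_t> b_go{0};
    const int leaf_kids = 3;

    std::vector<std::thread> leaves;
    for (int i = 0; i < leaf_kids; ++i)
        leaves.emplace_back([&, i] {
            const uint64_t my_bits = (uint64_t)1 << (8 * (7 - i));
            while ((b_go.load(std::memory_order_acquire) & my_bits) == 0)
                std::this_thread::yield();        // wait on my offset in the parent's word
            std::printf("leaf %d released\n", i);
        });

    uint64_t leaf_state = 0;
    for (int i = 0; i < leaf_kids; ++i)
        leaf_state |= (uint64_t)1 << (8 * (7 - i));

    // One store releases every leaf at once, like "thr_bar->b_go |= thr_bar->leaf_state".
    b_go.fetch_or(leaf_state, std::memory_order_release);

    for (auto &t : leaves) t.join();
    return 0;
}
#endif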
1039 
1040 // ---------------------------- End of Barrier Algorithms ----------------------------
1041 
1042 // Internal function to do a barrier.
1043 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1044  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1045  Returns 0 if master thread, 1 if worker thread. */
1046 int
1047 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1048  void *reduce_data, void (*reduce)(void *, void *))
1049 {
1050  KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
1051  register int tid = __kmp_tid_from_gtid(gtid);
1052  register kmp_info_t *this_thr = __kmp_threads[gtid];
1053  register kmp_team_t *team = this_thr->th.th_team;
1054  register int status = 0;
1055  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1056 #if OMPT_SUPPORT
1057  ompt_task_id_t my_task_id;
1058  ompt_parallel_id_t my_parallel_id;
1059 #endif
1060 
1061  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1062  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1063 
1064 #if OMPT_SUPPORT
1065  if (ompt_enabled) {
1066 #if OMPT_BLAME
1067  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1068  my_parallel_id = team->t.ompt_team_info.parallel_id;
1069 
1070 #if OMPT_TRACE
1071  if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1072  if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1073  ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1074  my_parallel_id, my_task_id);
1075  }
1076  }
1077 #endif
1078  if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1079  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1080  my_parallel_id, my_task_id);
1081  }
1082 #endif
1083  // It is OK to report the barrier state after the barrier begin callback.
1084  // According to the OMPT specification, a compliant implementation may
1085  // even delay reporting this state until the barrier begins to wait.
1086  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1087  }
1088 #endif
1089 
1090  if (! team->t.t_serialized) {
1091 #if USE_ITT_BUILD
1092  // This value will be used in itt notify events below.
1093  void *itt_sync_obj = NULL;
1094 # if USE_ITT_NOTIFY
1095  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1096  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1097 # endif
1098 #endif /* USE_ITT_BUILD */
1099  if (__kmp_tasking_mode == tskm_extra_barrier) {
1100  __kmp_tasking_barrier(team, this_thr, gtid);
1101  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1102  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1103  }
1104 
1105  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1106  the team struct is not guaranteed to exist. */
1107  // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1108  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1109  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1110  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1111  }
1112 
1113 #if USE_ITT_BUILD
1114  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1115  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1116 #endif /* USE_ITT_BUILD */
1117 #if USE_DEBUGGER
1118  // Let the debugger know: the thread arrived at the barrier and is waiting.
1119  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1120  team->t.t_bar[bt].b_master_arrived += 1;
1121  } else {
1122  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1123  } // if
1124 #endif /* USE_DEBUGGER */
1125  if (reduce != NULL) {
1126  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1127  this_thr->th.th_local.reduce_data = reduce_data;
1128  }
1129 
1130  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1131  __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
1132 
1133  switch (__kmp_barrier_gather_pattern[bt]) {
1134  case bp_hyper_bar: {
1135  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1136  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1137  USE_ITT_BUILD_ARG(itt_sync_obj) );
1138  break;
1139  }
1140  case bp_hierarchical_bar: {
1141  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1142  USE_ITT_BUILD_ARG(itt_sync_obj));
1143  break;
1144  }
1145  case bp_tree_bar: {
1146  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1147  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1148  USE_ITT_BUILD_ARG(itt_sync_obj) );
1149  break;
1150  }
1151  default: {
1152  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1153  USE_ITT_BUILD_ARG(itt_sync_obj) );
1154  }
1155  }
1156 
1157  KMP_MB();
1158 
1159  if (KMP_MASTER_TID(tid)) {
1160  status = 0;
1161  if (__kmp_tasking_mode != tskm_immediate_exec) {
1162  __kmp_task_team_wait(this_thr, team
1163  USE_ITT_BUILD_ARG(itt_sync_obj) );
1164  }
1165 #if USE_DEBUGGER
1166  // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1167  team->t.t_bar[bt].b_team_arrived += 1;
1168 #endif
1169 
1170 #if USE_ITT_BUILD
1171  /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1172  before the final summation into the shared variable is done (final summation can be a
1173  long operation for array reductions). */
1174  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1175  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1176 #endif /* USE_ITT_BUILD */
1177 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1178  // Barrier - report frame end (only if active_level == 1)
1179  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1180 #if OMP_40_ENABLED
1181  this_thr->th.th_teams_microtask == NULL &&
1182 #endif
1183  team->t.t_active_level == 1)
1184  {
1185  kmp_uint64 cur_time = __itt_get_timestamp();
1186  kmp_info_t **other_threads = team->t.t_threads;
1187  int nproc = this_thr->th.th_team_nproc;
1188  int i;
1189  switch(__kmp_forkjoin_frames_mode) {
1190  case 1:
1191  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1192  this_thr->th.th_frame_time = cur_time;
1193  break;
1194  case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1195  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1196  break;
1197  case 3:
1198  if( __itt_metadata_add_ptr ) {
1199  // Initialize with master's wait time
1200  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1201  for (i=1; i<nproc; ++i) {
1202  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1203  }
1204  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1205  }
1206  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1207  this_thr->th.th_frame_time = cur_time;
1208  break;
1209  }
1210  }
1211 #endif /* USE_ITT_BUILD */
1212  } else {
1213  status = 1;
1214 #if USE_ITT_BUILD
1215  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1216  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1217 #endif /* USE_ITT_BUILD */
1218  }
1219  if (status == 1 || ! is_split) {
1220  switch (__kmp_barrier_release_pattern[bt]) {
1221  case bp_hyper_bar: {
1222  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1223  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1224  USE_ITT_BUILD_ARG(itt_sync_obj) );
1225  break;
1226  }
1227  case bp_hierarchical_bar: {
1228  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1229  USE_ITT_BUILD_ARG(itt_sync_obj) );
1230  break;
1231  }
1232  case bp_tree_bar: {
1233  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1234  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1235  USE_ITT_BUILD_ARG(itt_sync_obj) );
1236  break;
1237  }
1238  default: {
1239  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1240  USE_ITT_BUILD_ARG(itt_sync_obj) );
1241  }
1242  }
1243  if (__kmp_tasking_mode != tskm_immediate_exec) {
1244  __kmp_task_team_sync(this_thr, team);
1245  }
1246  }
1247 
1248 #if USE_ITT_BUILD
1249  /* GEH: TODO: Move this under if-condition above and also include in
1250  __kmp_end_split_barrier(). This will more accurately represent the actual release time
1251  of the threads for split barriers. */
1252  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1253  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1254 #endif /* USE_ITT_BUILD */
1255  } else { // Team is serialized.
1256  status = 0;
1257  if (__kmp_tasking_mode != tskm_immediate_exec) {
1258 #if OMP_41_ENABLED
1259  if ( this_thr->th.th_task_team != NULL ) {
1260  void *itt_sync_obj = NULL;
1261 #if USE_ITT_NOTIFY
1262  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1263  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1264  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1265  }
1266 #endif
1267 
1268  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
1269  __kmp_task_team_wait(this_thr, team
1270  USE_ITT_BUILD_ARG(itt_sync_obj));
1271  __kmp_task_team_setup(this_thr, team, 0);
1272 
1273 #if USE_ITT_BUILD
1274  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1275  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1276 #endif /* USE_ITT_BUILD */
1277  }
1278 #else
1279  // The task team should be NULL for serialized code (tasks will be executed immediately)
1280  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1281  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1282 #endif
1283  }
1284  }
1285  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1286  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1287 
1288 #if OMPT_SUPPORT
1289  if (ompt_enabled) {
1290 #if OMPT_BLAME
1291  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1292  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1293  my_parallel_id, my_task_id);
1294  }
1295 #endif
1296  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1297  }
1298 #endif
1299 
1300  return status;
1301 }
1302 
1303 
1304 void
1305 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1306 {
1307  KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
1308  int tid = __kmp_tid_from_gtid(gtid);
1309  kmp_info_t *this_thr = __kmp_threads[gtid];
1310  kmp_team_t *team = this_thr->th.th_team;
1311 
1312  if (!team->t.t_serialized) {
1313  if (KMP_MASTER_GTID(gtid)) {
1314  switch (__kmp_barrier_release_pattern[bt]) {
1315  case bp_hyper_bar: {
1316  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1317  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1318  USE_ITT_BUILD_ARG(NULL) );
1319  break;
1320  }
1321  case bp_hierarchical_bar: {
1322  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1323  USE_ITT_BUILD_ARG(NULL));
1324  break;
1325  }
1326  case bp_tree_bar: {
1327  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1328  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1329  USE_ITT_BUILD_ARG(NULL) );
1330  break;
1331  }
1332  default: {
1333  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1334  USE_ITT_BUILD_ARG(NULL) );
1335  }
1336  }
1337  if (__kmp_tasking_mode != tskm_immediate_exec) {
1338  __kmp_task_team_sync(this_thr, team);
1339  } // if
1340  }
1341  }
1342 }
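/* Illustrative usage sketch (not copied from a specific call site): a split reduction barrier
   lets the master thread finish the reduction between the gather and release phases. Assuming
   the __kmp_barrier() prototype declared in kmp.h and that the master path above leaves
   status == 0, the protocol looks roughly like this:

       int status = __kmp_barrier(bs_plain_barrier, gtid, TRUE, reduce_size, reduce_data, reduce_func);
       if (status == 0) {   // master: all threads have arrived, but the team is not yet released
           // ... combine the workers' partial results into the shared reduction variable ...
           __kmp_end_split_barrier(bs_plain_barrier, gtid);   // now release the team
       }

   The real reduction entry points (e.g. __kmpc_reduce()/__kmpc_end_reduce()) wrap this protocol. */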
1343 
1344 
1345 void
1346 __kmp_join_barrier(int gtid)
1347 {
1348  KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
1349  register kmp_info_t *this_thr = __kmp_threads[gtid];
1350  register kmp_team_t *team;
1351  register kmp_uint nproc;
1352  kmp_info_t *master_thread;
1353  int tid;
1354 #ifdef KMP_DEBUG
1355  int team_id;
1356 #endif /* KMP_DEBUG */
1357 #if USE_ITT_BUILD
1358  void *itt_sync_obj = NULL;
1359 # if USE_ITT_NOTIFY
1360  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine if it is not needed
1361  // Get object created at fork_barrier
1362  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1363 # endif
1364 #endif /* USE_ITT_BUILD */
1365  KMP_MB();
1366 
1367  // Get current info
1368  team = this_thr->th.th_team;
1369  nproc = this_thr->th.th_team_nproc;
1370  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1371  tid = __kmp_tid_from_gtid(gtid);
1372 #ifdef KMP_DEBUG
1373  team_id = team->t.t_id;
1374 #endif /* KMP_DEBUG */
1375  master_thread = this_thr->th.th_team_master;
1376 #ifdef KMP_DEBUG
1377  if (master_thread != team->t.t_threads[0]) {
1378  __kmp_print_structure();
1379  }
1380 #endif /* KMP_DEBUG */
1381  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1382  KMP_MB();
1383 
1384  // Verify state
1385  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1386  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1387  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1388  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1389  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1390 
1391 #if OMPT_SUPPORT
1392 #if OMPT_TRACE
1393  if (ompt_enabled &&
1394  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1395  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1396  team->t.ompt_team_info.parallel_id,
1397  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1398  }
1399 #endif
1400  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1401 #endif
1402 
1403  if (__kmp_tasking_mode == tskm_extra_barrier) {
1404  __kmp_tasking_barrier(team, this_thr, gtid);
1405  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1406  }
1407 # ifdef KMP_DEBUG
1408  if (__kmp_tasking_mode != tskm_immediate_exec) {
1409  KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1410  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1411  this_thr->th.th_task_team));
1412  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1413  }
1414 # endif /* KMP_DEBUG */
1415 
1416  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1417  team struct is not guaranteed to exist. Doing these loads causes a cache miss and slows
1418  down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1419  since the values are not used by __kmp_wait_template() in that case. */
1420  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1421  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1422  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1423  }
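  /* The block time itself typically comes from the KMP_BLOCKTIME environment variable or
     kmp_set_blocktime(); KMP_BLOCKTIME=infinite maps to KMP_MAX_BLOCKTIME, which is why the
     copy above can be skipped in that case (exact spellings are runtime-version dependent). */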
1424 
1425 #if USE_ITT_BUILD
1426  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1427  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1428 #endif /* USE_ITT_BUILD */
1429 
1430  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1431  case bp_hyper_bar: {
1432  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1433  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1434  USE_ITT_BUILD_ARG(itt_sync_obj) );
1435  break;
1436  }
1437  case bp_hierarchical_bar: {
1438  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1439  USE_ITT_BUILD_ARG(itt_sync_obj) );
1440  break;
1441  }
1442  case bp_tree_bar: {
1443  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1444  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1445  USE_ITT_BUILD_ARG(itt_sync_obj) );
1446  break;
1447  }
1448  default: {
1449  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1450  USE_ITT_BUILD_ARG(itt_sync_obj) );
1451  }
1452  }
1453 
1454  /* From this point on, the team data structure may be deallocated at any time by the
1455  master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1456  data items that need to be referenced before the end of the barrier should be moved to
1457  the kmp_task_team_t structs. */
1458  if (KMP_MASTER_TID(tid)) {
1459  if (__kmp_tasking_mode != tskm_immediate_exec) {
1460  // Master shouldn't call decrease_load(). // TODO: enable master threads.
1461  // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1462  __kmp_task_team_wait(this_thr, team
1463  USE_ITT_BUILD_ARG(itt_sync_obj) );
1464  }
1465 #if USE_ITT_BUILD
1466  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1467  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1468 #endif /* USE_ITT_BUILD */
1469 
1470 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1471  // Join barrier - report frame end
1472  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1473 #if OMP_40_ENABLED
1474  this_thr->th.th_teams_microtask == NULL &&
1475 #endif
1476  team->t.t_active_level == 1)
1477  {
1478  kmp_uint64 cur_time = __itt_get_timestamp();
1479  ident_t * loc = team->t.t_ident;
1480  kmp_info_t **other_threads = team->t.t_threads;
1481  int nproc = this_thr->th.th_team_nproc;
1482  int i;
1483  switch(__kmp_forkjoin_frames_mode) {
1484  case 1:
1485  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1486  break;
1487  case 2:
1488  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1489  break;
1490  case 3:
1491  if( __itt_metadata_add_ptr ) {
1492  // Initialize with master's wait time
1493  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1494  for (i=1; i<nproc; ++i) {
1495  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1496  }
1497  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1498  }
1499  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1500  this_thr->th.th_frame_time = cur_time;
1501  break;
1502  }
1503  }
1504 # endif /* USE_ITT_BUILD */
1505  }
1506 #if USE_ITT_BUILD
1507  else {
1508  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1509  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1510  }
1511 #endif /* USE_ITT_BUILD */
1512 
1513 #if KMP_DEBUG
1514  if (KMP_MASTER_TID(tid)) {
1515  KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1516  gtid, team_id, tid, nproc));
1517  }
1518 #endif /* KMP_DEBUG */
1519 
1520  // TODO: now mark worker threads as done so they may be disbanded
1521  KMP_MB(); // Flush all pending memory write invalidates.
1522  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1523 
1524 #if OMPT_SUPPORT
1525  if (ompt_enabled) {
1526 #if OMPT_BLAME
1527  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1528  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1529  team->t.ompt_team_info.parallel_id,
1530  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1531  }
1532 #endif
1533 
1534  // return to default state
1535  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1536  }
1537 #endif
1538 }
1539 
1540 
1541 // TODO release worker threads' fork barriers as we are ready instead of all at once
1542 void
1543 __kmp_fork_barrier(int gtid, int tid)
1544 {
1545  KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
1546  kmp_info_t *this_thr = __kmp_threads[gtid];
1547  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1548 #if USE_ITT_BUILD
1549  void * itt_sync_obj = NULL;
1550 #endif /* USE_ITT_BUILD */
1551 
1552  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1553  gtid, (team != NULL) ? team->t.t_id : -1, tid));
1554 
1555  // The th_team pointer is only valid for the master thread here
1556  if (KMP_MASTER_TID(tid)) {
1557 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1558  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1559  // Create itt barrier object
1560  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1561  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1562  }
1563 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1564 
1565 #ifdef KMP_DEBUG
1566  register kmp_info_t **other_threads = team->t.t_threads;
1567  register int i;
1568 
1569  // Verify state
1570  KMP_MB();
1571 
1572  for(i=1; i<team->t.t_nproc; ++i) {
1573  KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1574  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1575  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1576  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
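  // The sleep bit may still be set if the worker suspended while waiting on its fork
  // barrier, so mask it off before comparing b_go against the initial barrier state.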
1577  KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1578  & ~(KMP_BARRIER_SLEEP_STATE))
1579  == KMP_INIT_BARRIER_STATE);
1580  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1581  }
1582 #endif
1583 
1584  if (__kmp_tasking_mode != tskm_immediate_exec) {
1585  __kmp_task_team_setup(this_thr, team, 0); // 0 indicates: set up the current task team if nthreads > 1
1586  }
1587 
1588  /* The master thread may have changed its blocktime between the join barrier and the
1589  fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1590  access it when the team struct is not guaranteed to exist. */
1591  // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1592  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1593  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1594  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1595  }
1596  } // master
1597 
1598  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1599  case bp_hyper_bar: {
1600  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1601  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1602  USE_ITT_BUILD_ARG(itt_sync_obj) );
1603  break;
1604  }
1605  case bp_hierarchical_bar: {
1606  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1607  USE_ITT_BUILD_ARG(itt_sync_obj) );
1608  break;
1609  }
1610  case bp_tree_bar: {
1611  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1612  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1613  USE_ITT_BUILD_ARG(itt_sync_obj) );
1614  break;
1615  }
1616  default: {
1617  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1618  USE_ITT_BUILD_ARG(itt_sync_obj) );
1619  }
1620  }
1621 
1622  // Early exit for reaping threads releasing forkjoin barrier
1623  if (TCR_4(__kmp_global.g.g_done)) {
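  // g_done is set when the runtime library is shutting down: this thread is about to be
  // reaped, so drop its task-team reference and leave without touching the team structure.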
1624  this_thr->th.th_task_team = NULL;
1625 
1626 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1627  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1628  if (!KMP_MASTER_TID(tid)) {
1629  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1630  if (itt_sync_obj)
1631  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1632  }
1633  }
1634 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1635  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1636  return;
1637  }
1638 
1639  /* We can now assume that a valid team structure has been allocated by the master and
1640  propagated to all worker threads. The current thread, however, may not be part of the
1641  team, so we can't blindly assume that the team pointer is non-null. */
1642  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1643  KMP_DEBUG_ASSERT(team != NULL);
1644  tid = __kmp_tid_from_gtid(gtid);
1645 
1646 
1647 #if KMP_BARRIER_ICV_PULL
1648  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1649  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1650  this data before this function is called. We cannot modify __kmp_fork_call() to look at
1651  the fixed ICVs in the master's thread struct, because it is not always the case that the
1652  threads arrays have been allocated when __kmp_fork_call() is executed. */
1653  {
1654  KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1655  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1656  // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1657  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1658  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1659  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1660  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1661  }
1662  }
1663 #endif // KMP_BARRIER_ICV_PULL
1664 
1665  if (__kmp_tasking_mode != tskm_immediate_exec) {
1666  __kmp_task_team_sync(this_thr, team);
1667  }
1668 
1669 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1670  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1671  if (proc_bind == proc_bind_intel) {
1672 #endif
1673 #if KMP_AFFINITY_SUPPORTED
1674  // Apply dynamic affinity settings
1675  if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1676  __kmp_balanced_affinity(tid, team->t.t_nproc);
1677  }
1678 #endif // KMP_AFFINITY_SUPPORTED
1679 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1680  }
1681  else if (proc_bind != proc_bind_false) {
1682  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1683  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1684  __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1685  }
1686  else {
1687  __kmp_affinity_set_place(gtid);
1688  }
1689  }
1690 #endif
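  /* Informal summary of the placement logic above: proc_bind_intel (the value used when no
     OpenMP proc-bind policy is in effect) falls back to the KMP_AFFINITY-style mechanism, in
     which only balanced affinity is re-applied here, and only when the team size changed; any
     other non-false proc-bind policy moves the thread to its new place via
     __kmp_affinity_set_place() unless it is already there. */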
1691 
1692 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1693  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1694  if (!KMP_MASTER_TID(tid)) {
1695  // Get correct barrier object
1696  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1697  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1698  } // (prepare called inside barrier_release)
1699  }
1700 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1701  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1702 }
1703 
1704 
1705 void
1706 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1707 {
1708  KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
1709 
1710  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1711  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1712 
1713  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1714  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1715  this data before this function is called. */
1716 #if KMP_BARRIER_ICV_PULL
1717  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which remains untouched),
1718  where all of the worker threads can access them and make their own copies after the barrier. */
1719  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1720  copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1721  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1722  0, team->t.t_threads[0], team));
1723 #elif KMP_BARRIER_ICV_PUSH
1724  // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1725  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1726  0, team->t.t_threads[0], team));
1727 #else
1728  // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1729  ngo_load(new_icvs);
1730  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1731  for (int f=1; f<new_nproc; ++f) { // Skip the master thread
1732  // TODO: GEH - pass in better source location info since usually NULL here
1733  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1734  f, team->t.t_threads[f], team));
1735  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1736  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1737  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1738  f, team->t.t_threads[f], team));
1739  }
1740  ngo_sync();
1741 #endif // KMP_BARRIER_ICV_PULL
1742 }
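/* Summary of the three ICV propagation schemes compiled above (copy_icvs() is assumed to be a
   plain structure copy of kmp_internal_control_t):
   - KMP_BARRIER_ICV_PULL: the master publishes new_icvs into its th_fixed_icvs here, and each
     worker pulls its own copy in __kmp_fork_barrier() after being released.
   - KMP_BARRIER_ICV_PUSH: the ICVs travel with the fork-barrier release itself, so this routine
     only traces.
   - otherwise: the master copies new_icvs into every worker's implicit task here, in O(nthreads)
     time, using the ngo_* stores on KMP_MIC builds. */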