1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 #include "kmp_os.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
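/* The ngo_* macros above broadcast ICVs with cache-line-sized stores. On KMP_MIC
   (Intel Xeon Phi), _mm512_storenrngo_pd writes a full 64-byte line with a
   non-globally-ordered streaming store, so one ngo_load of the master's ICV block can
   be replayed to many destinations cheaply, and ngo_sync() supplies the fence that
   restores ordering afterwards. On every other target the macros fall back to
   copy_icvs()/KMP_MEMCPY(). A rough usage sketch (this is the pattern the release
   routines below follow, not additional runtime logic):

       ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);  // load master ICVs once
       for (int i = 1; i < nproc; ++i)                          // fan the line out
           ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                          &team->t.t_implicit_task_taskdata[0].td_icvs);
       ngo_sync();                                              // order the stores
*/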
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
46 
47 // Linear Barrier
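/* Gather: every worker bumps its own b_arrived flag (waking the master if it sleeps)
   and the master spins on each worker's flag in turn, folding reduce_data along the
   way, so the arrival phase costs O(nproc) flag waits on the master. Release is the
   mirror image: the master bumps each worker's b_go flag one by one while each worker
   waits only on its own b_go. Simple and adequate for small teams; the tree, hyper,
   and hierarchical variants below trade the O(nproc) master work for logarithmic or
   topology-aware fan-out on larger teams. */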
48 static void
49 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *)
51  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
52 {
53  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
54  register kmp_team_t *team = this_thr->th.th_team;
55  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
56  register kmp_info_t **other_threads = team->t.t_threads;
57 
58  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
66  }
67 #endif
68  // We now perform a linear reduction to signal that all of the threads have arrived.
69  if (!KMP_MASTER_TID(tid)) {
70  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
71  " arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
72  __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
73  thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
74  // Mark arrival to master thread
75  /* After performing this write, a worker thread may not assume that the team is valid
76  any more - it could be deallocated by the master thread at any time. */
77  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
78  flag.release();
79  } else {
80  register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
81  register int nproc = this_thr->th.th_team_nproc;
82  register int i;
83  // No sleep bit or atomic needed here: this is team state that only the master updates
84  register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
85 
86  // Collect all the worker team member threads.
87  for (i=1; i<nproc; ++i) {
88 #if KMP_CACHE_MANAGE
89  // Prefetch next thread's arrived count
90  if (i+1 < nproc)
91  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
92 #endif /* KMP_CACHE_MANAGE */
93  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
94  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
95  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
96  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
97 
98  // Wait for worker thread to arrive
99  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
100  flag.wait(this_thr, FALSE
101  USE_ITT_BUILD_ARG(itt_sync_obj) );
102 #if USE_ITT_BUILD && USE_ITT_NOTIFY
103  // Barrier imbalance - write min of the thread time and the other thread time to the thread.
104  if (__kmp_forkjoin_frames_mode == 2) {
105  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
106  other_threads[i]->th.th_bar_min_time);
107  }
108 #endif
109  if (reduce) {
110  KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
111  team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
112  ANNOTATE_REDUCE_AFTER(reduce);
113  (*reduce)(this_thr->th.th_local.reduce_data,
114  other_threads[i]->th.th_local.reduce_data);
115  ANNOTATE_REDUCE_BEFORE(reduce);
116  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
117  }
118  }
119  // No sleep bit or atomic needed here: this is team state that only the master updates
120  team_bar->b_arrived = new_state;
121  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
122  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
123  }
124  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
125  gtid, team->t.t_id, tid, bt));
126 }
127 
128 static void
129 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
130  int propagate_icvs
131  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
132 {
133  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
134  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
135  register kmp_team_t *team;
136 
137  if (KMP_MASTER_TID(tid)) {
138  register unsigned int i;
139  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
140  register kmp_info_t **other_threads;
141 
142  team = __kmp_threads[gtid]->th.th_team;
143  KMP_DEBUG_ASSERT(team != NULL);
144  other_threads = team->t.t_threads;
145 
146  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
147  gtid, team->t.t_id, tid, bt));
148 
149  if (nproc > 1) {
150 #if KMP_BARRIER_ICV_PUSH
151  {
152  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
153  if (propagate_icvs) {
154  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
155  for (i=1; i<nproc; ++i) {
156  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
157  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
158  &team->t.t_implicit_task_taskdata[0].td_icvs);
159  }
160  ngo_sync();
161  }
162  }
163 #endif // KMP_BARRIER_ICV_PUSH
164 
165  // Now, release all of the worker threads
166  for (i=1; i<nproc; ++i) {
167 #if KMP_CACHE_MANAGE
168  // Prefetch next thread's go flag
169  if (i+1 < nproc)
170  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
171 #endif /* KMP_CACHE_MANAGE */
172  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
173  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
174  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
175  &other_threads[i]->th.th_bar[bt].bb.b_go,
176  other_threads[i]->th.th_bar[bt].bb.b_go,
177  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
178  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
179  flag.release();
180  }
181  }
182  } else { // Wait for the MASTER thread to release us
183  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
184  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
185  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
186  flag.wait(this_thr, TRUE
187  USE_ITT_BUILD_ARG(itt_sync_obj) );
188 #if USE_ITT_BUILD && USE_ITT_NOTIFY
189  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
190  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
191  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
192  // Cancel wait on previous parallel region...
193  __kmp_itt_task_starting(itt_sync_obj);
194 
195  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
196  return;
197 
198  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
199  if (itt_sync_obj != NULL)
200  // Call prepare as early as possible for "new" barrier
201  __kmp_itt_task_finished(itt_sync_obj);
202  } else
203 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
204  // Early exit for reaping threads releasing forkjoin barrier
205  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
206  return;
207  // The worker thread may now assume that the team is valid.
208 #ifdef KMP_DEBUG
209  tid = __kmp_tid_from_gtid(gtid);
210  team = __kmp_threads[gtid]->th.th_team;
211 #endif
212  KMP_DEBUG_ASSERT(team != NULL);
213  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
214  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
215  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
216  KMP_MB(); // Flush all pending memory write invalidates.
217  }
218  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
219  gtid, team->t.t_id, tid, bt));
220 }
221 
222 // Tree barrier
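/* The tree gather/release use a simple radix tree over thread ids: a parent's first
   child is (tid << branch_bits) + 1 and a worker's parent is (tid - 1) >> branch_bits,
   with up to branch_factor = 1 << branch_bits children per node. Worked example with
   branch_bits == 2 (branch_factor == 4) and nproc == 10:

       tid 0 gathers tids 1,2,3,4
       tid 1 gathers tids 5,6,7,8
       tid 2 gathers tid  9
       tids 3..9 have no children and just signal their parents

   The gather waits bottom-up on the children's b_arrived flags; the release walks the
   same tree top-down through the children's b_go flags. */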
223 static void
224 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
225  void (*reduce)(void *, void *)
226  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
227 {
228  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
229  register kmp_team_t *team = this_thr->th.th_team;
230  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
231  register kmp_info_t **other_threads = team->t.t_threads;
232  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
233  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
234  register kmp_uint32 branch_factor = 1 << branch_bits;
235  register kmp_uint32 child;
236  register kmp_uint32 child_tid;
237  register kmp_uint64 new_state;
238 
239  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
240  gtid, team->t.t_id, tid, bt));
241  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
242 
243 #if USE_ITT_BUILD && USE_ITT_NOTIFY
244  // Barrier imbalance - save arrive time to the thread
245  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
246  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
247  }
248 #endif
249  // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
250  child_tid = (tid << branch_bits) + 1;
251  if (child_tid < nproc) {
252  // Parent threads wait for all their children to arrive
253  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
254  child = 1;
255  do {
256  register kmp_info_t *child_thr = other_threads[child_tid];
257  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
258 #if KMP_CACHE_MANAGE
259  // Prefetch next thread's arrived count
260  if (child+1 <= branch_factor && child_tid+1 < nproc)
261  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
262 #endif /* KMP_CACHE_MANAGE */
263  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
264  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
265  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
266  &child_bar->b_arrived, new_state));
267  // Wait for child to arrive
268  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
269  flag.wait(this_thr, FALSE
270  USE_ITT_BUILD_ARG(itt_sync_obj) );
271 #if USE_ITT_BUILD && USE_ITT_NOTIFY
272  // Barrier imbalance - write min of the thread time and a child time to the thread.
273  if (__kmp_forkjoin_frames_mode == 2) {
274  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
275  child_thr->th.th_bar_min_time);
276  }
277 #endif
278  if (reduce) {
279  KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
280  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
281  team->t.t_id, child_tid));
282  ANNOTATE_REDUCE_AFTER(reduce);
283  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
284  ANNOTATE_REDUCE_BEFORE(reduce);
285  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
286  }
287  child++;
288  child_tid++;
289  }
290  while (child <= branch_factor && child_tid < nproc);
291  }
292 
293  if (!KMP_MASTER_TID(tid)) { // Worker threads
294  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
295 
296  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
297  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
298  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
299  &thr_bar->b_arrived, thr_bar->b_arrived,
300  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
301 
302  // Mark arrival to parent thread
303  /* After performing this write, a worker thread may not assume that the team is valid
304  any more - it could be deallocated by the master thread at any time. */
305  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
306  flag.release();
307  } else {
308  // Need to update the team arrived pointer if we are the master thread
309  if (nproc > 1) // New value was already computed above
310  team->t.t_bar[bt].b_arrived = new_state;
311  else
312  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
313  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
314  gtid, team->t.t_id, tid, team->t.t_id,
315  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
316  }
317  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
318  gtid, team->t.t_id, tid, bt));
319 }
320 
321 static void
322 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
323  int propagate_icvs
324  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
325 {
326  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
327  register kmp_team_t *team;
328  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
329  register kmp_uint32 nproc;
330  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
331  register kmp_uint32 branch_factor = 1 << branch_bits;
332  register kmp_uint32 child;
333  register kmp_uint32 child_tid;
334 
335  // Perform a tree release for all of the threads that have been gathered
336  if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
337  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
338  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
339  // Wait for parent thread to release us
340  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
341  flag.wait(this_thr, TRUE
342  USE_ITT_BUILD_ARG(itt_sync_obj) );
343 #if USE_ITT_BUILD && USE_ITT_NOTIFY
344  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
345  // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
346  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
347  // Cancel wait on previous parallel region...
348  __kmp_itt_task_starting(itt_sync_obj);
349 
350  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
351  return;
352 
353  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
354  if (itt_sync_obj != NULL)
355  // Call prepare as early as possible for "new" barrier
356  __kmp_itt_task_finished(itt_sync_obj);
357  } else
358 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
359  // Early exit for reaping threads releasing forkjoin barrier
360  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
361  return;
362 
363  // The worker thread may now assume that the team is valid.
364  team = __kmp_threads[gtid]->th.th_team;
365  KMP_DEBUG_ASSERT(team != NULL);
366  tid = __kmp_tid_from_gtid(gtid);
367 
368  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
369  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
370  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
371  KMP_MB(); // Flush all pending memory write invalidates.
372  } else {
373  team = __kmp_threads[gtid]->th.th_team;
374  KMP_DEBUG_ASSERT(team != NULL);
375  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
376  gtid, team->t.t_id, tid, bt));
377  }
378  nproc = this_thr->th.th_team_nproc;
379  child_tid = (tid << branch_bits) + 1;
380 
381  if (child_tid < nproc) {
382  register kmp_info_t **other_threads = team->t.t_threads;
383  child = 1;
384  // Parent threads release all their children
385  do {
386  register kmp_info_t *child_thr = other_threads[child_tid];
387  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
388 #if KMP_CACHE_MANAGE
389  // Prefetch next thread's go count
390  if (child+1 <= branch_factor && child_tid+1 < nproc)
391  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
392 #endif /* KMP_CACHE_MANAGE */
393 
394 #if KMP_BARRIER_ICV_PUSH
395  {
396  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
397  if (propagate_icvs) {
398  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
399  team, child_tid, FALSE);
400  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
401  &team->t.t_implicit_task_taskdata[0].td_icvs);
402  }
403  }
404 #endif // KMP_BARRIER_ICV_PUSH
405  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
406  " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
407  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
408  child_tid, &child_bar->b_go, child_bar->b_go,
409  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
410  // Release child from barrier
411  kmp_flag_64 flag(&child_bar->b_go, child_thr);
412  flag.release();
413  child++;
414  child_tid++;
415  }
416  while (child <= branch_factor && child_tid < nproc);
417  }
418  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
419  gtid, team->t.t_id, tid, bt));
420 }
421 
422 
423 // Hyper Barrier
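/* The hyper barrier embeds a branch_factor-ary tree in a hypercube of thread ids: at
   level L (L increases in steps of branch_bits), a thread is a parent iff the digit
   (tid >> L) & (branch_factor - 1) is zero, its children at that level are
   tid + k*(1 << L) for k = 1..branch_factor-1, and a child reports to parent
   tid & ~((1 << (L + branch_bits)) - 1). Worked example with branch_bits == 2
   (branch_factor == 4) and num_threads == 8:

       level 0: tid 0 gathers 1,2,3; tid 4 gathers 5,6,7
       level 2: tid 4 reports to tid 0, which gathers it and finishes

   At the lowest level most of the signalling happens between adjacent thread ids. */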
424 static void
425 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
426  void (*reduce)(void *, void *)
427  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
428 {
429  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
430  register kmp_team_t *team = this_thr->th.th_team;
431  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
432  register kmp_info_t **other_threads = team->t.t_threads;
433  register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
434  register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
435  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
436  register kmp_uint32 branch_factor = 1 << branch_bits;
437  register kmp_uint32 offset;
438  register kmp_uint32 level;
439 
440  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
441  gtid, team->t.t_id, tid, bt));
442 
443  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
444 
445 #if USE_ITT_BUILD && USE_ITT_NOTIFY
446  // Barrier imbalance - save arrive time to the thread
447  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
448  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
449  }
450 #endif
451  /* Perform a hypercube-embedded tree gather to wait until all of the threads have
452  arrived, and reduce any required data as we go. */
453  kmp_flag_64 p_flag(&thr_bar->b_arrived);
454  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
455  {
456  register kmp_uint32 child;
457  register kmp_uint32 child_tid;
458 
459  if (((tid >> level) & (branch_factor - 1)) != 0) {
460  register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
461 
462  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
463  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
464  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
465  &thr_bar->b_arrived, thr_bar->b_arrived,
466  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
467  // Mark arrival to parent thread
468  /* After performing this write (in the last iteration of the enclosing for loop),
469  a worker thread may not assume that the team is valid any more - it could be
470  deallocated by the master thread at any time. */
471  p_flag.set_waiter(other_threads[parent_tid]);
472  p_flag.release();
473  break;
474  }
475 
476  // Parent threads wait for children to arrive
477  if (new_state == KMP_BARRIER_UNUSED_STATE)
478  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
479  for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
480  child++, child_tid+=(1 << level))
481  {
482  register kmp_info_t *child_thr = other_threads[child_tid];
483  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
484 #if KMP_CACHE_MANAGE
485  register kmp_uint32 next_child_tid = child_tid + (1 << level);
486  // Prefetch next thread's arrived count
487  if (child+1 < branch_factor && next_child_tid < num_threads)
488  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
489 #endif /* KMP_CACHE_MANAGE */
490  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
491  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
492  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
493  &child_bar->b_arrived, new_state));
494  // Wait for child to arrive
495  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
496  c_flag.wait(this_thr, FALSE
497  USE_ITT_BUILD_ARG(itt_sync_obj) );
498 #if USE_ITT_BUILD && USE_ITT_NOTIFY
499  // Barrier imbalance - write min of the thread time and a child time to the thread.
500  if (__kmp_forkjoin_frames_mode == 2) {
501  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
502  child_thr->th.th_bar_min_time);
503  }
504 #endif
505  if (reduce) {
506  KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
507  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
508  team->t.t_id, child_tid));
509  ANNOTATE_REDUCE_AFTER(reduce);
510  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
511  ANNOTATE_REDUCE_BEFORE(reduce);
512  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
513  }
514  }
515  }
516 
517  if (KMP_MASTER_TID(tid)) {
518  // Need to update the team arrived pointer if we are the master thread
519  if (new_state == KMP_BARRIER_UNUSED_STATE)
520  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
521  else
522  team->t.t_bar[bt].b_arrived = new_state;
523  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
524  gtid, team->t.t_id, tid, team->t.t_id,
525  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
526  }
527  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
528  gtid, team->t.t_id, tid, bt));
529 }
530 
531 // The reverse versions seem to beat the forward versions overall
532 #define KMP_REVERSE_HYPER_BAR
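/* With KMP_REVERSE_HYPER_BAR defined, the release below first climbs to the highest
   level at which this thread is a parent and then walks back down, visiting children
   from the highest tid to the lowest, i.e. the reverse of the gather order. For
   branch_bits == 2 and num_threads == 8, tid 0 releases 4, then 3, 2, 1, whereas the
   forward variant would release 1, 2, 3, then 4. */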
533 static void
534 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
535  int propagate_icvs
536  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
537 {
538  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
539  register kmp_team_t *team;
540  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
541  register kmp_info_t **other_threads;
542  register kmp_uint32 num_threads;
543  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
544  register kmp_uint32 branch_factor = 1 << branch_bits;
545  register kmp_uint32 child;
546  register kmp_uint32 child_tid;
547  register kmp_uint32 offset;
548  register kmp_uint32 level;
549 
550  /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
551  If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
552  order of the corresponding gather, otherwise threads are released in the same order. */
553  if (KMP_MASTER_TID(tid)) { // master
554  team = __kmp_threads[gtid]->th.th_team;
555  KMP_DEBUG_ASSERT(team != NULL);
556  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
557  gtid, team->t.t_id, tid, bt));
558 #if KMP_BARRIER_ICV_PUSH
559  if (propagate_icvs) { // master already has ICVs in final destination; copy
560  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
561  }
562 #endif
563  }
564  else { // Handle fork barrier workers who aren't part of a team yet
565  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
566  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
567  // Wait for parent thread to release us
568  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
569  flag.wait(this_thr, TRUE
570  USE_ITT_BUILD_ARG(itt_sync_obj) );
571 #if USE_ITT_BUILD && USE_ITT_NOTIFY
572  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
573  // In fork barrier where we could not get the object reliably
574  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
575  // Cancel wait on previous parallel region...
576  __kmp_itt_task_starting(itt_sync_obj);
577 
578  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
579  return;
580 
581  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
582  if (itt_sync_obj != NULL)
583  // Call prepare as early as possible for "new" barrier
584  __kmp_itt_task_finished(itt_sync_obj);
585  } else
586 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
587  // Early exit for reaping threads releasing forkjoin barrier
588  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
589  return;
590 
591  // The worker thread may now assume that the team is valid.
592  team = __kmp_threads[gtid]->th.th_team;
593  KMP_DEBUG_ASSERT(team != NULL);
594  tid = __kmp_tid_from_gtid(gtid);
595 
596  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
597  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
598  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
599  KMP_MB(); // Flush all pending memory write invalidates.
600  }
601  num_threads = this_thr->th.th_team_nproc;
602  other_threads = team->t.t_threads;
603 
604 #ifdef KMP_REVERSE_HYPER_BAR
605  // Count up to correct level for parent
606  for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
607  level+=branch_bits, offset<<=branch_bits);
608 
609  // Now go down from there
610  for (level-=branch_bits, offset>>=branch_bits; offset != 0;
611  level-=branch_bits, offset>>=branch_bits)
612 #else
613  // Go down the tree, level by level
614  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
615 #endif // KMP_REVERSE_HYPER_BAR
616  {
617 #ifdef KMP_REVERSE_HYPER_BAR
618  /* Now go in reverse order through the children, highest to lowest.
619  Initial setting of child is conservative here. */
620  child = num_threads >> ((level==0)?level:level-1);
621  for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
622  child>=1; child--, child_tid-=(1<<level))
623 #else
624  if (((tid >> level) & (branch_factor - 1)) != 0)
625  // No need to go lower than this, since this is the level at which the parent is notified
626  break;
627  // Iterate through children on this level of the tree
628  for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
629  child++, child_tid+=(1<<level))
630 #endif // KMP_REVERSE_HYPER_BAR
631  {
632  if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
633  else {
634  register kmp_info_t *child_thr = other_threads[child_tid];
635  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
636 #if KMP_CACHE_MANAGE
637  register kmp_uint32 next_child_tid = child_tid - (1 << level);
638  // Prefetch next thread's go count
639 # ifdef KMP_REVERSE_HYPER_BAR
640  if (child-1 >= 1 && next_child_tid < num_threads)
641 # else
642  if (child+1 < branch_factor && next_child_tid < num_threads)
643 # endif // KMP_REVERSE_HYPER_BAR
644  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
645 #endif /* KMP_CACHE_MANAGE */
646 
647 #if KMP_BARRIER_ICV_PUSH
648  if (propagate_icvs) // push my fixed ICVs to my child
649  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
650 #endif // KMP_BARRIER_ICV_PUSH
651 
652  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
653  " go(%p): %u => %u\n", gtid, team->t.t_id, tid,
654  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
655  child_tid, &child_bar->b_go, child_bar->b_go,
656  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
657  // Release child from barrier
658  kmp_flag_64 flag(&child_bar->b_go, child_thr);
659  flag.release();
660  }
661  }
662  }
663 #if KMP_BARRIER_ICV_PUSH
664  if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
665  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
666  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
667  }
668 #endif
669  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
670  gtid, team->t.t_id, tid, bt));
671 }
672 
673 // Hierarchical Barrier
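/* The hierarchical barrier arranges the team by machine topology (__kmp_get_hierarchy)
   rather than by a fixed radix: each thread caches its level in the hierarchy, its
   parent_tid, and its number of direct "leaf" children in its kmp_bstate_t. When
   blocktime is infinite (KMP_MAX_BLOCKTIME) and the barrier is not nested, leaf
   children use the "oncore" path: instead of spinning on flags of their own, they
   signal and wait on designated bytes of their parent's 64-bit b_arrived/b_go words,
   so a single 64-bit flag per parent carries the whole leaf group's state. Otherwise
   the code falls back to the conventional per-thread b_arrived/b_go flags. */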
674 
675 // Initialize thread barrier data
676 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
677  minimum amount of initialization required based on how the team has changed. Returns true if
678  leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
679  team size increases, threads already in the team will respond to on-core wakeup on their parent
680  thread, but threads newly added to the team will only be listening on their local b_go. */
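/* Byte layout used by the oncore path: a leaf child with thread id parent_tid + k
   (k = 1..leaf_kids) is assigned offset = 7 - (k - 1), i.e. byte 7 of the parent's
   64-bit flag for the first leaf, byte 6 for the second, and so on; leaf_state is the
   value with exactly those bytes set. For example, leaf_kids == 3 gives offsets 7, 6, 5,
   so the parent's gather can wait for all three arrivals with a single 64-bit compare
   against leaf_state and then clear them in one KMP_TEST_THEN_AND64. */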
681 static bool
682 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
683  int gtid, int tid, kmp_team_t *team)
684 {
685  // Checks to determine if (re-)initialization is needed
686  bool uninitialized = thr_bar->team == NULL;
687  bool team_changed = team != thr_bar->team;
688  bool team_sz_changed = nproc != thr_bar->nproc;
689  bool tid_changed = tid != thr_bar->old_tid;
690  bool retval = false;
691 
692  if (uninitialized || team_sz_changed) {
693  __kmp_get_hierarchy(nproc, thr_bar);
694  }
695 
696  if (uninitialized || team_sz_changed || tid_changed) {
697  thr_bar->my_level = thr_bar->depth-1; // default for master
698  thr_bar->parent_tid = -1; // default for master
699  if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
700  kmp_uint32 d=0;
701  while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
702  kmp_uint32 rem;
703  if (d == thr_bar->depth-2) { // reached level right below the master
704  thr_bar->parent_tid = 0;
705  thr_bar->my_level = d;
706  break;
707  }
708  else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
709  // thread is not a subtree root at the next level, so this is its topmost level
710  thr_bar->parent_tid = tid - rem;
711  thr_bar->my_level = d;
712  break;
713  }
714  ++d;
715  }
716  }
717  thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
718  thr_bar->old_tid = tid;
719  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
720  thr_bar->team = team;
721  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
722  }
723  if (uninitialized || team_changed || tid_changed) {
724  thr_bar->team = team;
725  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
726  retval = true;
727  }
728  if (uninitialized || team_sz_changed || tid_changed) {
729  thr_bar->nproc = nproc;
730  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
731  if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
732  if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
733  thr_bar->leaf_kids = nproc - tid - 1;
734  thr_bar->leaf_state = 0;
735  for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
736  }
737  return retval;
738 }
739 
740 static void
741 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
742  int gtid, int tid, void (*reduce) (void *, void *)
743  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
744 {
745  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
746  register kmp_team_t *team = this_thr->th.th_team;
747  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
748  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
749  register kmp_info_t **other_threads = team->t.t_threads;
750  register kmp_uint64 new_state;
751 
752  int level = team->t.t_level;
753 #if OMP_40_ENABLED
754  if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
755  if (this_thr->th.th_teams_size.nteams > 1)
756  ++level; // level was not increased in teams construct for team_of_masters
757 #endif
758  if (level == 1) thr_bar->use_oncore_barrier = 1;
759  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
760 
761  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
762  gtid, team->t.t_id, tid, bt));
763  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
764 
765 #if USE_ITT_BUILD && USE_ITT_NOTIFY
766  // Barrier imbalance - save arrive time to the thread
767  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
768  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
769  }
770 #endif
771 
772  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
773 
774  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
775  register kmp_int32 child_tid;
776  new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
777  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
778  if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
779  kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
780  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
781  gtid, team->t.t_id, tid));
782  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
783  flag.wait(this_thr, FALSE
784  USE_ITT_BUILD_ARG(itt_sync_obj) );
785  if (reduce) {
786  ANNOTATE_REDUCE_AFTER(reduce);
787  for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
788  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
789  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
790  team->t.t_id, child_tid));
791  (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
792  }
793  ANNOTATE_REDUCE_BEFORE(reduce);
794  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
795  }
796  (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
797  }
798  // Next, wait for higher level children on each child's b_arrived flag
799  for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
800  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
801  if (last > nproc) last = nproc;
802  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
803  register kmp_info_t *child_thr = other_threads[child_tid];
804  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
805  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
806  "arrived(%p) == %llu\n",
807  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
808  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
809  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
810  flag.wait(this_thr, FALSE
811  USE_ITT_BUILD_ARG(itt_sync_obj) );
812  if (reduce) {
813  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
814  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
815  team->t.t_id, child_tid));
816  ANNOTATE_REDUCE_AFTER(reduce);
817  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
818  ANNOTATE_REDUCE_BEFORE(reduce);
819  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
820  }
821  }
822  }
823  }
824  else { // Blocktime is not infinite
825  for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
826  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
827  if (last > nproc) last = nproc;
828  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
829  register kmp_info_t *child_thr = other_threads[child_tid];
830  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
831  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
832  "arrived(%p) == %llu\n",
833  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
834  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
835  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
836  flag.wait(this_thr, FALSE
837  USE_ITT_BUILD_ARG(itt_sync_obj) );
838  if (reduce) {
839  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
840  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
841  team->t.t_id, child_tid));
842  ANNOTATE_REDUCE_AFTER(reduce);
843  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
844  ANNOTATE_REDUCE_BEFORE(reduce);
845  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
846  }
847  }
848  }
849  }
850  }
851  // All subordinates are gathered; now release parent if not master thread
852 
853  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
854  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
855  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
856  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
857  &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
858  /* Mark arrival to parent: After performing this write, a worker thread may not assume that
859  the team is valid any more - it could be deallocated by the master thread at any time. */
860  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
861  || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
862  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
863  flag.release();
864  }
865  else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
866  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
867  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
868  flag.set_waiter(other_threads[thr_bar->parent_tid]);
869  flag.release();
870  }
871  } else { // Master thread needs to update the team's b_arrived value
872  team->t.t_bar[bt].b_arrived = new_state;
873  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
874  gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
875  }
876  // Is the team access below unsafe or just technically invalid?
877  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
878  gtid, team->t.t_id, tid, bt));
879 }
880 
881 static void
882 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
883  int propagate_icvs
884  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
885 {
886  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
887  register kmp_team_t *team;
888  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
889  register kmp_uint32 nproc;
890  bool team_change = false; // indicates on-core barrier shouldn't be used
891 
892  if (KMP_MASTER_TID(tid)) {
893  team = __kmp_threads[gtid]->th.th_team;
894  KMP_DEBUG_ASSERT(team != NULL);
895  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
896  gtid, team->t.t_id, tid, bt));
897  }
898  else { // Worker threads
899  // Wait for parent thread to release me
900  if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
901  || thr_bar->my_level != 0 || thr_bar->team == NULL) {
902  // Use traditional method of waiting on my own b_go flag
903  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
904  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
905  flag.wait(this_thr, TRUE
906  USE_ITT_BUILD_ARG(itt_sync_obj) );
907  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
908  }
909  else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
910  // Wait on my "offset" bits on parent's b_go flag
911  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
912  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
913  bt, this_thr
914  USE_ITT_BUILD_ARG(itt_sync_obj) );
915  flag.wait(this_thr, TRUE);
916  if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
917  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
918  }
919  else { // Reset my bits on parent's b_go flag
920  ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
921  }
922  }
923  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
924  // Early exit for reaping threads releasing forkjoin barrier
925  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
926  return;
927  // The worker thread may now assume that the team is valid.
928  team = __kmp_threads[gtid]->th.th_team;
929  KMP_DEBUG_ASSERT(team != NULL);
930  tid = __kmp_tid_from_gtid(gtid);
931 
932  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
933  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
934  KMP_MB(); // Flush all pending memory write invalidates.
935  }
936 
937  nproc = this_thr->th.th_team_nproc;
938  int level = team->t.t_level;
939 #if OMP_40_ENABLED
940  if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
941  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
942  ++level; // level was not increased in teams construct for team_of_workers
943  if( this_thr->th.th_teams_size.nteams > 1 )
944  ++level; // level was not increased in teams construct for team_of_masters
945  }
946 #endif
947  if (level == 1) thr_bar->use_oncore_barrier = 1;
948  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
949 
950  // If the team size has increased, we still communicate with old leaves via oncore barrier.
951  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
952  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
953  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
954  // But if the entire team changes, we won't use oncore barrier at all
955  if (team_change) old_leaf_kids = 0;
956 
957 #if KMP_BARRIER_ICV_PUSH
958  if (propagate_icvs) {
959  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
960  if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
961  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
962  }
963  else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
964  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
965  // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
966  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
967  &thr_bar->parent_bar->th_fixed_icvs);
968  // non-leaves will get ICVs piggybacked with b_go via NGO store
969  }
970  else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
971  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
972  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
973  else // leaves copy parent's fixed ICVs directly to local ICV store
974  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
975  &thr_bar->parent_bar->th_fixed_icvs);
976  }
977  }
978 #endif // KMP_BARRIER_ICV_PUSH
979 
980  // Now, release my children
981  if (thr_bar->my_level) { // not a leaf
982  register kmp_int32 child_tid;
983  kmp_uint32 last;
984  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
985  if (KMP_MASTER_TID(tid)) { // do a flat release
986  // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
987  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
988  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
989  ngo_load(&thr_bar->th_fixed_icvs);
990  // This loops over all the threads skipping only the leaf nodes in the hierarchy
991  for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
992  register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
993  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
994  " go(%p): %u => %u\n",
995  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
996  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
997  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
998  // Use ngo store (if available) to both store ICVs and release child via child's b_go
999  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1000  }
1001  ngo_sync();
1002  }
1003  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1004  // Now, release leaf children
1005  if (thr_bar->leaf_kids) { // if there are any
1006  // We test team_change on the off-chance that the level 1 team changed.
1007  if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
1008  if (old_leaf_kids) { // release old leaf kids
1009  thr_bar->b_go |= old_leaf_state;
1010  }
1011  // Release new leaf kids
1012  last = tid+thr_bar->skip_per_level[1];
1013  if (last > nproc) last = nproc;
1014  for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
1015  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1016  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1017  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1018  " T#%d(%d:%d) go(%p): %u => %u\n",
1019  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1020  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1021  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1022  // Release child using child's b_go flag
1023  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1024  flag.release();
1025  }
1026  }
1027  else { // Release all children at once with leaf_state bits on my own b_go flag
1028  thr_bar->b_go |= thr_bar->leaf_state;
1029  }
1030  }
1031  }
1032  else { // Blocktime is not infinite; do a simple hierarchical release
1033  for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1034  last = tid+thr_bar->skip_per_level[d+1];
1035  kmp_uint32 skip = thr_bar->skip_per_level[d];
1036  if (last > nproc) last = nproc;
1037  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1038  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1039  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1040  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1041  " go(%p): %u => %u\n",
1042  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1043  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1044  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1045  // Release child using child's b_go flag
1046  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1047  flag.release();
1048  }
1049  }
1050  }
1051 #if KMP_BARRIER_ICV_PUSH
1052  if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1053  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1054 #endif // KMP_BARRIER_ICV_PUSH
1055  }
1056  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1057  gtid, team->t.t_id, tid, bt));
1058 }
1059 
1060 // ---------------------------- End of Barrier Algorithms ----------------------------
1061 
1062 // Internal function to do a barrier.
1063 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1064  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1065  Returns 0 if master thread, 1 if worker thread. */
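/* Structure of the code below: every thread runs the gather phase selected by
   __kmp_barrier_gather_pattern[bt]; the master then drains the task team; workers
   always enter the matching release phase from __kmp_barrier_release_pattern[bt],
   while the master skips it when is_split is true (the split is completed later by
   __kmp_end_split_barrier). Typical callers are the __kmpc_barrier entry point
   (bt == bs_plain_barrier) and the reduction support, which passes
   bs_reduction_barrier along with reduce_data/reduce. */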
1066 int
1067 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1068  void *reduce_data, void (*reduce)(void *, void *))
1069 {
1070  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1071  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1072  register int tid = __kmp_tid_from_gtid(gtid);
1073  register kmp_info_t *this_thr = __kmp_threads[gtid];
1074  register kmp_team_t *team = this_thr->th.th_team;
1075  register int status = 0;
1076  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1077 #if OMPT_SUPPORT
1078  ompt_task_id_t my_task_id;
1079  ompt_parallel_id_t my_parallel_id;
1080 #endif
1081 
1082  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1083  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1084 
1085  ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
1086 #if OMPT_SUPPORT
1087  if (ompt_enabled) {
1088 #if OMPT_BLAME
1089  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1090  my_parallel_id = team->t.ompt_team_info.parallel_id;
1091 
1092 #if OMPT_TRACE
1093  if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1094  if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1095  ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1096  my_parallel_id, my_task_id);
1097  }
1098  }
1099 #endif
1100  if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1101  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1102  my_parallel_id, my_task_id);
1103  }
1104 #endif
1105  // It is OK to report the barrier state after the barrier begin callback.
1106  // According to the OMPT specification, a compliant implementation may
1107  // even delay reporting this state until the barrier begins to wait.
1108  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1109  }
1110 #endif
1111 
1112  if (! team->t.t_serialized) {
1113 #if USE_ITT_BUILD
1114  // This value will be used in itt notify events below.
1115  void *itt_sync_obj = NULL;
1116 # if USE_ITT_NOTIFY
1117  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1118  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1119 # endif
1120 #endif /* USE_ITT_BUILD */
1121  if (__kmp_tasking_mode == tskm_extra_barrier) {
1122  __kmp_tasking_barrier(team, this_thr, gtid);
1123  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1124  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1125  }
1126 
1127  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1128  the team struct is not guaranteed to exist. */
1129  // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1130  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1131 #if KMP_USE_MONITOR
1132  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1133 #endif
1134  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1135  }
1136 
1137 #if USE_ITT_BUILD
1138  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1139  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1140 #endif /* USE_ITT_BUILD */
1141 #if USE_DEBUGGER
1142  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1143  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1144  team->t.t_bar[bt].b_master_arrived += 1;
1145  } else {
1146  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1147  } // if
1148 #endif /* USE_DEBUGGER */
1149  if (reduce != NULL) {
1150  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1151  this_thr->th.th_local.reduce_data = reduce_data;
1152  }
1153 
1154  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1155  __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
1156 
1157  switch (__kmp_barrier_gather_pattern[bt]) {
1158  case bp_hyper_bar: {
1159  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1160  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1161  USE_ITT_BUILD_ARG(itt_sync_obj) );
1162  break;
1163  }
1164  case bp_hierarchical_bar: {
1165  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1166  USE_ITT_BUILD_ARG(itt_sync_obj));
1167  break;
1168  }
1169  case bp_tree_bar: {
1170  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1171  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1172  USE_ITT_BUILD_ARG(itt_sync_obj) );
1173  break;
1174  }
1175  default: {
1176  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1177  USE_ITT_BUILD_ARG(itt_sync_obj) );
1178  }
1179  }
1180 
1181  KMP_MB();
1182 
1183  if (KMP_MASTER_TID(tid)) {
1184  status = 0;
1185  if (__kmp_tasking_mode != tskm_immediate_exec) {
1186  __kmp_task_team_wait(this_thr, team
1187  USE_ITT_BUILD_ARG(itt_sync_obj) );
1188  }
1189 #if USE_DEBUGGER
1190  // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1191  team->t.t_bar[bt].b_team_arrived += 1;
1192 #endif
1193 
1194 #if USE_ITT_BUILD
1195  /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1196  before the final summation into the shared variable is done (final summation can be a
1197  long operation for array reductions). */
1198  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1199  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1200 #endif /* USE_ITT_BUILD */
1201 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1202  // Barrier - report frame end (only if active_level == 1)
1203  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1204 #if OMP_40_ENABLED
1205  this_thr->th.th_teams_microtask == NULL &&
1206 #endif
1207  team->t.t_active_level == 1)
1208  {
1209  kmp_uint64 cur_time = __itt_get_timestamp();
1210  kmp_info_t **other_threads = team->t.t_threads;
1211  int nproc = this_thr->th.th_team_nproc;
1212  int i;
1213  switch(__kmp_forkjoin_frames_mode) {
1214  case 1:
1215  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1216  this_thr->th.th_frame_time = cur_time;
1217  break;
1218  case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1219  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1220  break;
1221  case 3:
1222  if( __itt_metadata_add_ptr ) {
1223  // Initialize with master's wait time
1224  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1225  // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1226  this_thr->th.th_bar_arrive_time = 0;
1227  for (i=1; i<nproc; ++i) {
1228  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1229  other_threads[i]->th.th_bar_arrive_time = 0;
1230  }
1231  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1232  }
1233  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1234  this_thr->th.th_frame_time = cur_time;
1235  break;
1236  }
1237  }
1238 #endif /* USE_ITT_BUILD */
1239  } else {
1240  status = 1;
1241 #if USE_ITT_BUILD
1242  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1243  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1244 #endif /* USE_ITT_BUILD */
1245  }
1246  if (status == 1 || ! is_split) {
1247  switch (__kmp_barrier_release_pattern[bt]) {
1248  case bp_hyper_bar: {
1249  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1250  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1251  USE_ITT_BUILD_ARG(itt_sync_obj) );
1252  break;
1253  }
1254  case bp_hierarchical_bar: {
1255  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1256  USE_ITT_BUILD_ARG(itt_sync_obj) );
1257  break;
1258  }
1259  case bp_tree_bar: {
1260  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1261  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1262  USE_ITT_BUILD_ARG(itt_sync_obj) );
1263  break;
1264  }
1265  default: {
1266  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1267  USE_ITT_BUILD_ARG(itt_sync_obj) );
1268  }
1269  }
1270  if (__kmp_tasking_mode != tskm_immediate_exec) {
1271  __kmp_task_team_sync(this_thr, team);
1272  }
1273  }
1274 
1275 #if USE_ITT_BUILD
1276  /* GEH: TODO: Move this under if-condition above and also include in
1277  __kmp_end_split_barrier(). This will more accurately represent the actual release time
1278  of the threads for split barriers. */
1279  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1280  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1281 #endif /* USE_ITT_BUILD */
1282  } else { // Team is serialized.
1283  status = 0;
1284  if (__kmp_tasking_mode != tskm_immediate_exec) {
1285 #if OMP_45_ENABLED
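 // With proxy tasks (OMP 4.5), even a serialized team can have an active task team;
 // wait for those tasks to complete, then set the task team up again.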
1286  if ( this_thr->th.th_task_team != NULL ) {
1287  void *itt_sync_obj = NULL;
1288 #if USE_ITT_NOTIFY
1289  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1290  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1291  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1292  }
1293 #endif
1294 
1295  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
1296  __kmp_task_team_wait(this_thr, team
1297  USE_ITT_BUILD_ARG(itt_sync_obj));
1298  __kmp_task_team_setup(this_thr, team, 0);
1299 
1300 #if USE_ITT_BUILD
1301  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1302  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1303 #endif /* USE_ITT_BUILD */
1304  }
1305 #else
1306  // The task team should be NULL for serialized code (tasks will be executed immediately)
1307  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1308  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1309 #endif
1310  }
1311  }
1312  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1313  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1314 
1315 #if OMPT_SUPPORT
1316  if (ompt_enabled) {
1317 #if OMPT_BLAME
1318  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1319  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1320  my_parallel_id, my_task_id);
1321  }
1322 #endif
1323  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1324  }
1325 #endif
1326  ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1327 
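 // status is 0 for the master thread (and for a serialized team), 1 for a worker thread.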
1328  return status;
1329 }
1330 
1331 
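/* Complete the release phase of a split barrier. Only the master thread performs the
   release; worker threads are already waiting in the release phase of __kmp_barrier()
   for the go signal, which is sent from here once the master has finished its
   post-gather work (for example, the final reduction). Illustrative usage only -- a
   sketch, not a verbatim caller:

       // executed by every thread of the team
       if (__kmp_barrier(bs_plain_barrier, gtid, TRUE, 0, NULL, NULL) == 0) {
           // master: do the work that must precede the release
       }
       __kmp_end_split_barrier(bs_plain_barrier, gtid);
*/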
1332 void
1333 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1334 {
1335  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1336  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1337  int tid = __kmp_tid_from_gtid(gtid);
1338  kmp_info_t *this_thr = __kmp_threads[gtid];
1339  kmp_team_t *team = this_thr->th.th_team;
1340 
1341  ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
1342  if (!team->t.t_serialized) {
1343  if (KMP_MASTER_GTID(gtid)) {
1344  switch (__kmp_barrier_release_pattern[bt]) {
1345  case bp_hyper_bar: {
1346  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1347  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1348  USE_ITT_BUILD_ARG(NULL) );
1349  break;
1350  }
1351  case bp_hierarchical_bar: {
1352  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1353  USE_ITT_BUILD_ARG(NULL));
1354  break;
1355  }
1356  case bp_tree_bar: {
1357  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1358  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1359  USE_ITT_BUILD_ARG(NULL) );
1360  break;
1361  }
1362  default: {
1363  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1364  USE_ITT_BUILD_ARG(NULL) );
1365  }
1366  }
1367  if (__kmp_tasking_mode != tskm_immediate_exec) {
1368  __kmp_task_team_sync(this_thr, team);
1369  } // if
1370  }
1371  }
1372  ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1373 }
1374 
1375 
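/* Gather phase of the fork/join barrier, executed at the end of a parallel region.
   There is no release phase here: worker threads are next released from
   __kmp_fork_barrier() when the master forks another parallel region (or when the
   library shuts down). */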
1376 void
1377 __kmp_join_barrier(int gtid)
1378 {
1379  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1380  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1381  register kmp_info_t *this_thr = __kmp_threads[gtid];
1382  register kmp_team_t *team;
1383  register kmp_uint nproc;
1384  kmp_info_t *master_thread;
1385  int tid;
1386 #ifdef KMP_DEBUG
1387  int team_id;
1388 #endif /* KMP_DEBUG */
1389 #if USE_ITT_BUILD
1390  void *itt_sync_obj = NULL;
1391 # if USE_ITT_NOTIFY
1392  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless it is needed
1393  // Get object created at fork_barrier
1394  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1395 # endif
1396 #endif /* USE_ITT_BUILD */
1397  KMP_MB();
1398 
1399  // Get current info
1400  team = this_thr->th.th_team;
1401  nproc = this_thr->th.th_team_nproc;
1402  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1403  tid = __kmp_tid_from_gtid(gtid);
1404 #ifdef KMP_DEBUG
1405  team_id = team->t.t_id;
1406 #endif /* KMP_DEBUG */
1407  master_thread = this_thr->th.th_team_master;
1408 #ifdef KMP_DEBUG
1409  if (master_thread != team->t.t_threads[0]) {
1410  __kmp_print_structure();
1411  }
1412 #endif /* KMP_DEBUG */
1413  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1414  KMP_MB();
1415 
1416  // Verify state
1417  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1418  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1419  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1420  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1421  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1422 
1423  ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
1424 #if OMPT_SUPPORT
1425 #if OMPT_TRACE
1426  if (ompt_enabled &&
1427  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1428  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1429  team->t.ompt_team_info.parallel_id,
1430  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1431  }
1432 #endif
1433  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1434 #endif
1435 
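 // In 'extra barrier' tasking mode, run a separate tasking barrier (finishing
 // outstanding tasks) before the join gather.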
1436  if (__kmp_tasking_mode == tskm_extra_barrier) {
1437  __kmp_tasking_barrier(team, this_thr, gtid);
1438  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1439  }
1440 # ifdef KMP_DEBUG
1441  if (__kmp_tasking_mode != tskm_immediate_exec) {
1442  KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1443  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1444  this_thr->th.th_task_team));
1445  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1446  }
1447 # endif /* KMP_DEBUG */
1448 
1449  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1450  team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1451  down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1452  since the values are not used by __kmp_wait_template() in that case. */
1453  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1454 #if KMP_USE_MONITOR
1455  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1456 #endif
1457  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1458  }
1459 
1460 #if USE_ITT_BUILD
1461  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1462  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1463 #endif /* USE_ITT_BUILD */
1464 
1465  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1466  case bp_hyper_bar: {
1467  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1468  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1469  USE_ITT_BUILD_ARG(itt_sync_obj) );
1470  break;
1471  }
1472  case bp_hierarchical_bar: {
1473  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1474  USE_ITT_BUILD_ARG(itt_sync_obj) );
1475  break;
1476  }
1477  case bp_tree_bar: {
1478  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1479  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1480  USE_ITT_BUILD_ARG(itt_sync_obj) );
1481  break;
1482  }
1483  default: {
1484  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1485  USE_ITT_BUILD_ARG(itt_sync_obj) );
1486  }
1487  }
1488 
1489  /* From this point on, the team data structure may be deallocated at any time by the
1490  master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1491  data items that need to be referenced before the end of the barrier should be moved to
1492  the kmp_task_team_t structs. */
1493  if (KMP_MASTER_TID(tid)) {
1494  if (__kmp_tasking_mode != tskm_immediate_exec) {
1495  __kmp_task_team_wait(this_thr, team
1496  USE_ITT_BUILD_ARG(itt_sync_obj) );
1497  }
1498 #if KMP_STATS_ENABLED
1499  // Have the master thread flag the workers to indicate they are now waiting for the
1500  // next parallel region. Also wake them up so they switch their timers to idle.
1501  for (int i=0; i<team->t.t_nproc; ++i) {
1502  kmp_info_t* team_thread = team->t.t_threads[i];
1503  if (team_thread == this_thr)
1504  continue;
1505  team_thread->th.th_stats->setIdleFlag();
1506  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1507  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1508  }
1509 #endif
1510 #if USE_ITT_BUILD
1511  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1512  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1513 #endif /* USE_ITT_BUILD */
1514 
1515 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1516  // Join barrier - report frame end
1517  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1518 #if OMP_40_ENABLED
1519  this_thr->th.th_teams_microtask == NULL &&
1520 #endif
1521  team->t.t_active_level == 1)
1522  {
1523  kmp_uint64 cur_time = __itt_get_timestamp();
1524  ident_t * loc = team->t.t_ident;
1525  kmp_info_t **other_threads = team->t.t_threads;
1526  int nproc = this_thr->th.th_team_nproc;
1527  int i;
1528  switch(__kmp_forkjoin_frames_mode) {
1529  case 1:
1530  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1531  break;
1532  case 2:
1533  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1534  break;
1535  case 3:
1536  if( __itt_metadata_add_ptr ) {
1537  // Initialize with master's wait time
1538  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1539  // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1540  this_thr->th.th_bar_arrive_time = 0;
1541  for (i=1; i<nproc; ++i) {
1542  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1543  other_threads[i]->th.th_bar_arrive_time = 0;
1544  }
1545  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1546  }
1547  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1548  this_thr->th.th_frame_time = cur_time;
1549  break;
1550  }
1551  }
1552 # endif /* USE_ITT_BUILD */
1553  }
1554 #if USE_ITT_BUILD
1555  else {
1556  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1557  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1558  }
1559 #endif /* USE_ITT_BUILD */
1560 
1561 #if KMP_DEBUG
1562  if (KMP_MASTER_TID(tid)) {
1563  KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1564  gtid, team_id, tid, nproc));
1565  }
1566 #endif /* KMP_DEBUG */
1567 
1568  // TODO now, mark worker threads as done so they may be disbanded
1569  KMP_MB(); // Flush all pending memory write invalidates.
1570  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1571 
1572 #if OMPT_SUPPORT
1573  if (ompt_enabled) {
1574 #if OMPT_BLAME
1575  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1576  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1577  team->t.ompt_team_info.parallel_id,
1578  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1579  }
1580 #endif
1581 
1582  // return to default state
1583  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1584  }
1585 #endif
1586  ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1587 }
1588 
1589 
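/* Fork/join barrier executed at the start of a parallel region (and by workers parked
   between regions). The master sets up the task team and copies the blocktime info,
   then all threads go through the release phase; workers wait here for the go signal
   and, once released, pick up the new team, ICVs, task team and affinity settings. */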
1590 // TODO release worker threads' fork barriers as we are ready instead of all at once
1591 void
1592 __kmp_fork_barrier(int gtid, int tid)
1593 {
1594  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1595  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1596  kmp_info_t *this_thr = __kmp_threads[gtid];
1597  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1598 #if USE_ITT_BUILD
1599  void * itt_sync_obj = NULL;
1600 #endif /* USE_ITT_BUILD */
1601  if (team)
1602  ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1603 
1604  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1605  gtid, (team != NULL) ? team->t.t_id : -1, tid));
1606 
1607  // th_team pointer only valid for master thread here
1608  if (KMP_MASTER_TID(tid)) {
1609 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1610  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1611  // Create itt barrier object
1612  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1613  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1614  }
1615 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1616 
1617 #ifdef KMP_DEBUG
1618  register kmp_info_t **other_threads = team->t.t_threads;
1619  register int i;
1620 
1621  // Verify state
1622  KMP_MB();
1623 
1624  for(i=1; i<team->t.t_nproc; ++i) {
1625  KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1626  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1627  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1628  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1629  KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1630  & ~(KMP_BARRIER_SLEEP_STATE))
1631  == KMP_INIT_BARRIER_STATE);
1632  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1633  }
1634 #endif
1635 
1636  if (__kmp_tasking_mode != tskm_immediate_exec) {
1637  __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1
1638  }
1639 
1640  /* The master thread may have changed its blocktime between the join barrier and the
1641  fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1642  access it when the team struct is not guaranteed to exist. */
1643  // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1644  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1645 #if KMP_USE_MONITOR
1646  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1647 #endif
1648  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1649  }
1650  } // master
1651 
1652  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1653  case bp_hyper_bar: {
1654  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1655  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1656  USE_ITT_BUILD_ARG(itt_sync_obj) );
1657  break;
1658  }
1659  case bp_hierarchical_bar: {
1660  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1661  USE_ITT_BUILD_ARG(itt_sync_obj) );
1662  break;
1663  }
1664  case bp_tree_bar: {
1665  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1666  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1667  USE_ITT_BUILD_ARG(itt_sync_obj) );
1668  break;
1669  }
1670  default: {
1671  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1672  USE_ITT_BUILD_ARG(itt_sync_obj) );
1673  }
1674  }
1675 
1676  // Early exit for reaping threads releasing forkjoin barrier
1677  if (TCR_4(__kmp_global.g.g_done)) {
1678  this_thr->th.th_task_team = NULL;
1679 
1680 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1681  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1682  if (!KMP_MASTER_TID(tid)) {
1683  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1684  if (itt_sync_obj)
1685  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1686  }
1687  }
1688 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1689  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1690  return;
1691  }
1692 
1693  /* We can now assume that a valid team structure has been allocated by the master and
1694  propagated to all worker threads. The current thread, however, may not be part of the
1695  team, so we can't blindly assume that the team pointer is non-null. */
1696  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1697  KMP_DEBUG_ASSERT(team != NULL);
1698  tid = __kmp_tid_from_gtid(gtid);
1699 
1700 
1701 #if KMP_BARRIER_ICV_PULL
1702  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1703  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1704  this data before this function is called. We cannot modify __kmp_fork_call() to look at
1705  the fixed ICVs in the master's thread struct, because it is not always the case that the
1706  threads arrays have been allocated when __kmp_fork_call() is executed. */
1707  {
1708  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1709  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1710  // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1711  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1712  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1713  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1714  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1715  }
1716  }
1717 #endif // KMP_BARRIER_ICV_PULL
1718 
1719  if (__kmp_tasking_mode != tskm_immediate_exec) {
1720  __kmp_task_team_sync(this_thr, team);
1721  }
1722 
1723 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1724  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
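 // proc_bind_intel: no explicit place binding is applied here; only dynamic (balanced)
 // affinity may move the thread. Other proc-bind policies re-place the thread below if
 // its place has changed.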
1725  if (proc_bind == proc_bind_intel) {
1726 #endif
1727 #if KMP_AFFINITY_SUPPORTED
1728  // Call dynamic affinity settings
1729  if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1730  __kmp_balanced_affinity(tid, team->t.t_nproc);
1731  }
1732 #endif // KMP_AFFINITY_SUPPORTED
1733 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1734  }
1735  else if (proc_bind != proc_bind_false) {
1736  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1737  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1738  __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1739  }
1740  else {
1741  __kmp_affinity_set_place(gtid);
1742  }
1743  }
1744 #endif
1745 
1746 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1747  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1748  if (!KMP_MASTER_TID(tid)) {
1749  // Get correct barrier object
1750  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1751  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1752  } // (prepare called inside barrier_release)
1753  }
1754 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1755  ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
1756  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1757 }
1758 
1759 
1760 void
1761 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1762 {
1763  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
1764 
1765  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1766  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1767 
1768  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1769  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1770  this data before this function is called. */
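 /* Three propagation strategies follow: with KMP_BARRIER_ICV_PULL the workers copy the
    ICVs from the master's th_fixed_icvs during the fork barrier; with KMP_BARRIER_ICV_PUSH
    the ICVs are pushed to the workers as part of the fork-barrier release; otherwise they
    are copied linearly into each worker's implicit task right here. */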
1771 #if KMP_BARRIER_ICV_PULL
1772  /* Copy the ICVs into the master thread's th_fixed_icvs field (which remains untouched), where
1773  all of the worker threads can access them and make their own copies after the barrier. */
1774  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1775  copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1776  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1777  0, team->t.t_threads[0], team));
1778 #elif KMP_BARRIER_ICV_PUSH
1779  // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1780  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1781  0, team->t.t_threads[0], team));
1782 #else
1783  // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1784  ngo_load(new_icvs);
1785  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1786  for (int f=1; f<new_nproc; ++f) { // Skip the master thread
1787  // TODO: GEH - pass in better source location info since usually NULL here
1788  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1789  f, team->t.t_threads[f], team));
1790  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1791  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1792  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1793  f, team->t.t_threads[f], team));
1794  }
1795  ngo_sync();
1796 #endif // KMP_BARRIER_ICV_PULL
1797 }