LLVM OpenMP* Runtime Library
kmp_affinity.h
1/*
2 * kmp_affinity.h -- header for affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef KMP_AFFINITY_H
14#define KMP_AFFINITY_H
15
16#include "kmp.h"
17#include "kmp_os.h"
18
19#if KMP_AFFINITY_SUPPORTED
20#if KMP_USE_HWLOC
21class KMPHwlocAffinity : public KMPAffinity {
22public:
23 class Mask : public KMPAffinity::Mask {
24 hwloc_cpuset_t mask;
25
26 public:
27 Mask() {
28 mask = hwloc_bitmap_alloc();
29 this->zero();
30 }
31 ~Mask() { hwloc_bitmap_free(mask); }
32 void set(int i) override { hwloc_bitmap_set(mask, i); }
33 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34 void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35 void zero() override { hwloc_bitmap_zero(mask); }
36 void copy(const KMPAffinity::Mask *src) override {
37 const Mask *convert = static_cast<const Mask *>(src);
38 hwloc_bitmap_copy(mask, convert->mask);
39 }
40 void bitwise_and(const KMPAffinity::Mask *rhs) override {
41 const Mask *convert = static_cast<const Mask *>(rhs);
42 hwloc_bitmap_and(mask, mask, convert->mask);
43 }
44 void bitwise_or(const KMPAffinity::Mask *rhs) override {
45 const Mask *convert = static_cast<const Mask *>(rhs);
46 hwloc_bitmap_or(mask, mask, convert->mask);
47 }
48 void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
49 int begin() const override { return hwloc_bitmap_first(mask); }
50 int end() const override { return -1; }
51 int next(int previous) const override {
52 return hwloc_bitmap_next(mask, previous);
53 }
54 int get_system_affinity(bool abort_on_error) override {
55 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56 "Illegal get affinity operation when not capable");
57 long retval =
58 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59 if (retval >= 0) {
60 return 0;
61 }
62 int error = errno;
63 if (abort_on_error) {
64 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65 }
66 return error;
67 }
68 int set_system_affinity(bool abort_on_error) const override {
69 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70 "Illegal set affinity operation when not capable");
71 long retval =
72 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73 if (retval >= 0) {
74 return 0;
75 }
76 int error = errno;
77 if (abort_on_error) {
78 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79 }
80 return error;
81 }
82#if KMP_OS_WINDOWS
83 int set_process_affinity(bool abort_on_error) const override {
84 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
85 "Illegal set process affinity operation when not capable");
86 int error = 0;
87 const hwloc_topology_support *support =
88 hwloc_topology_get_support(__kmp_hwloc_topology);
89 if (support->cpubind->set_proc_cpubind) {
90 int retval;
91 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
92 HWLOC_CPUBIND_PROCESS);
93 if (retval >= 0)
94 return 0;
95 error = errno;
96 if (abort_on_error)
97 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
98 }
99 return error;
100 }
101#endif
102 int get_proc_group() const override {
103 int group = -1;
104#if KMP_OS_WINDOWS
105 if (__kmp_num_proc_groups == 1) {
106 return 1;
107 }
108 for (int i = 0; i < __kmp_num_proc_groups; i++) {
 109 // On Windows, the long type is always 32 bits
110 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
111 unsigned long second_32_bits =
112 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
113 if (first_32_bits == 0 && second_32_bits == 0) {
114 continue;
115 }
116 if (group >= 0) {
117 return -1;
118 }
119 group = i;
120 }
121#endif /* KMP_OS_WINDOWS */
122 return group;
123 }
124 };
125 void determine_capable(const char *var) override {
126 const hwloc_topology_support *topology_support;
127 if (__kmp_hwloc_topology == NULL) {
128 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
129 __kmp_hwloc_error = TRUE;
130 if (__kmp_affinity_verbose)
131 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
132 }
133 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
134 __kmp_hwloc_error = TRUE;
135 if (__kmp_affinity_verbose)
136 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
137 }
138 }
139 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
140 // Is the system capable of setting/getting this thread's affinity?
141 // Also, is topology discovery possible? (pu indicates ability to discover
142 // processing units). And finally, were there no errors when calling any
143 // hwloc_* API functions?
144 if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
145 topology_support->cpubind->get_thisthread_cpubind &&
146 topology_support->discovery->pu && !__kmp_hwloc_error) {
147 // enables affinity according to KMP_AFFINITY_CAPABLE() macro
148 KMP_AFFINITY_ENABLE(TRUE);
149 } else {
150 // indicate that hwloc didn't work and disable affinity
151 __kmp_hwloc_error = TRUE;
152 KMP_AFFINITY_DISABLE();
153 }
154 }
155 void bind_thread(int which) override {
156 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
157 "Illegal set affinity operation when not capable");
158 KMPAffinity::Mask *mask;
159 KMP_CPU_ALLOC_ON_STACK(mask);
160 KMP_CPU_ZERO(mask);
161 KMP_CPU_SET(which, mask);
162 __kmp_set_system_affinity(mask, TRUE);
163 KMP_CPU_FREE_FROM_STACK(mask);
164 }
165 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
166 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
167 KMPAffinity::Mask *allocate_mask_array(int num) override {
168 return new Mask[num];
169 }
170 void deallocate_mask_array(KMPAffinity::Mask *array) override {
171 Mask *hwloc_array = static_cast<Mask *>(array);
172 delete[] hwloc_array;
173 }
174 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
175 int index) override {
176 Mask *hwloc_array = static_cast<Mask *>(array);
177 return &(hwloc_array[index]);
178 }
179 api_type get_api_type() const override { return HWLOC; }
180};
181#endif /* KMP_USE_HWLOC */
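// Example sketch: how a caller might drive the KMPAffinity::Mask interface
// implemented above -- allocate a mask, pin the calling thread to one CPU, and
// walk the bits that are set. The `api` pointer stands in for whichever
// KMPAffinity implementation the runtime selected; it is an assumption of this
// sketch, not a name defined in this header.
#if 0 // illustrative only
static void example_pin_to_cpu(KMPAffinity *api, int cpu) {
  KMPAffinity::Mask *m = api->allocate_mask(); // e.g. a KMPHwlocAffinity::Mask
  m->zero();
  m->set(cpu);
  m->set_system_affinity(/*abort_on_error=*/false); // bind the calling thread
  // begin()/next()/end() iterate the set bits; for hwloc, end() is -1 and
  // next() returns -1 past the last set bit, so `!=` terminates correctly.
  for (int i = m->begin(); i != m->end(); i = m->next(i)) {
    // ... inspect CPU i ...
  }
  api->deallocate_mask(m);
}
#endif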
182
183#if KMP_OS_LINUX || KMP_OS_FREEBSD
184#if KMP_OS_LINUX
 185/* On some of the older OSes that we build on, these constants aren't present
 186   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
 187   all systems of the same arch where they are defined, and they cannot
 188   change; they are set in stone forever. */
189#include <sys/syscall.h>
190#if KMP_ARCH_X86 || KMP_ARCH_ARM
191#ifndef __NR_sched_setaffinity
192#define __NR_sched_setaffinity 241
193#elif __NR_sched_setaffinity != 241
194#error Wrong code for setaffinity system call.
195#endif /* __NR_sched_setaffinity */
196#ifndef __NR_sched_getaffinity
197#define __NR_sched_getaffinity 242
198#elif __NR_sched_getaffinity != 242
199#error Wrong code for getaffinity system call.
200#endif /* __NR_sched_getaffinity */
201#elif KMP_ARCH_AARCH64
202#ifndef __NR_sched_setaffinity
203#define __NR_sched_setaffinity 122
204#elif __NR_sched_setaffinity != 122
205#error Wrong code for setaffinity system call.
206#endif /* __NR_sched_setaffinity */
207#ifndef __NR_sched_getaffinity
208#define __NR_sched_getaffinity 123
209#elif __NR_sched_getaffinity != 123
210#error Wrong code for getaffinity system call.
211#endif /* __NR_sched_getaffinity */
212#elif KMP_ARCH_RISCV64
213#ifndef __NR_sched_setaffinity
214#define __NR_sched_setaffinity 122
215#elif __NR_sched_setaffinity != 122
216#error Wrong code for setaffinity system call.
217#endif /* __NR_sched_setaffinity */
218#ifndef __NR_sched_getaffinity
219#define __NR_sched_getaffinity 123
220#elif __NR_sched_getaffinity != 123
221#error Wrong code for getaffinity system call.
222#endif /* __NR_sched_getaffinity */
223#elif KMP_ARCH_X86_64
224#ifndef __NR_sched_setaffinity
225#define __NR_sched_setaffinity 203
226#elif __NR_sched_setaffinity != 203
227#error Wrong code for setaffinity system call.
228#endif /* __NR_sched_setaffinity */
229#ifndef __NR_sched_getaffinity
230#define __NR_sched_getaffinity 204
231#elif __NR_sched_getaffinity != 204
232#error Wrong code for getaffinity system call.
233#endif /* __NR_sched_getaffinity */
234#elif KMP_ARCH_PPC64
235#ifndef __NR_sched_setaffinity
236#define __NR_sched_setaffinity 222
237#elif __NR_sched_setaffinity != 222
238#error Wrong code for setaffinity system call.
239#endif /* __NR_sched_setaffinity */
240#ifndef __NR_sched_getaffinity
241#define __NR_sched_getaffinity 223
242#elif __NR_sched_getaffinity != 223
243#error Wrong code for getaffinity system call.
244#endif /* __NR_sched_getaffinity */
 245#elif KMP_ARCH_MIPS
 246#ifndef __NR_sched_setaffinity
 247#define __NR_sched_setaffinity 4239
 248#elif __NR_sched_setaffinity != 4239
 249#error Wrong code for setaffinity system call.
 250#endif /* __NR_sched_setaffinity */
 251#ifndef __NR_sched_getaffinity
 252#define __NR_sched_getaffinity 4240
 253#elif __NR_sched_getaffinity != 4240
 254#error Wrong code for getaffinity system call.
 255#endif /* __NR_sched_getaffinity */
 256#elif KMP_ARCH_MIPS64
 257#ifndef __NR_sched_setaffinity
 258#define __NR_sched_setaffinity 5195
 259#elif __NR_sched_setaffinity != 5195
 260#error Wrong code for setaffinity system call.
 261#endif /* __NR_sched_setaffinity */
 262#ifndef __NR_sched_getaffinity
 263#define __NR_sched_getaffinity 5196
 264#elif __NR_sched_getaffinity != 5196
 265#error Wrong code for getaffinity system call.
 266#endif /* __NR_sched_getaffinity */
 267#else
268#error Unknown or unsupported architecture
269#endif /* KMP_ARCH_* */
270#elif KMP_OS_FREEBSD
271#include <pthread.h>
272#include <pthread_np.h>
273#endif
274class KMPNativeAffinity : public KMPAffinity {
275 class Mask : public KMPAffinity::Mask {
276 typedef unsigned long mask_t;
277 typedef decltype(__kmp_affin_mask_size) mask_size_type;
278 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
279 static const mask_t ONE = 1;
280 mask_size_type get_num_mask_types() const {
281 return __kmp_affin_mask_size / sizeof(mask_t);
282 }
283
284 public:
285 mask_t *mask;
286 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
287 ~Mask() {
288 if (mask)
289 __kmp_free(mask);
290 }
291 void set(int i) override {
292 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
293 }
294 bool is_set(int i) const override {
295 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
296 }
297 void clear(int i) override {
298 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
299 }
300 void zero() override {
301 mask_size_type e = get_num_mask_types();
302 for (mask_size_type i = 0; i < e; ++i)
303 mask[i] = (mask_t)0;
304 }
305 void copy(const KMPAffinity::Mask *src) override {
306 const Mask *convert = static_cast<const Mask *>(src);
307 mask_size_type e = get_num_mask_types();
308 for (mask_size_type i = 0; i < e; ++i)
309 mask[i] = convert->mask[i];
310 }
311 void bitwise_and(const KMPAffinity::Mask *rhs) override {
312 const Mask *convert = static_cast<const Mask *>(rhs);
313 mask_size_type e = get_num_mask_types();
314 for (mask_size_type i = 0; i < e; ++i)
315 mask[i] &= convert->mask[i];
316 }
317 void bitwise_or(const KMPAffinity::Mask *rhs) override {
318 const Mask *convert = static_cast<const Mask *>(rhs);
319 mask_size_type e = get_num_mask_types();
320 for (mask_size_type i = 0; i < e; ++i)
321 mask[i] |= convert->mask[i];
322 }
323 void bitwise_not() override {
324 mask_size_type e = get_num_mask_types();
325 for (mask_size_type i = 0; i < e; ++i)
326 mask[i] = ~(mask[i]);
327 }
328 int begin() const override {
329 int retval = 0;
330 while (retval < end() && !is_set(retval))
331 ++retval;
332 return retval;
333 }
334 int end() const override {
335 int e;
336 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
337 return e;
338 }
339 int next(int previous) const override {
340 int retval = previous + 1;
341 while (retval < end() && !is_set(retval))
342 ++retval;
343 return retval;
344 }
345 int get_system_affinity(bool abort_on_error) override {
346 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
347 "Illegal get affinity operation when not capable");
348#if KMP_OS_LINUX
349 long retval =
350 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
351#elif KMP_OS_FREEBSD
352 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
353 reinterpret_cast<cpuset_t *>(mask));
354 int retval = (r == 0 ? 0 : -1);
355#endif
356 if (retval >= 0) {
357 return 0;
358 }
359 int error = errno;
360 if (abort_on_error) {
361 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
362 }
363 return error;
364 }
365 int set_system_affinity(bool abort_on_error) const override {
366 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
367 "Illegal set affinity operation when not capable");
368#if KMP_OS_LINUX
369 long retval =
370 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
371#elif KMP_OS_FREEBSD
372 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
373 reinterpret_cast<cpuset_t *>(mask));
374 int retval = (r == 0 ? 0 : -1);
375#endif
376 if (retval >= 0) {
377 return 0;
378 }
379 int error = errno;
380 if (abort_on_error) {
381 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
382 }
383 return error;
384 }
385 };
386 void determine_capable(const char *env_var) override {
387 __kmp_affinity_determine_capable(env_var);
388 }
389 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
390 KMPAffinity::Mask *allocate_mask() override {
391 KMPNativeAffinity::Mask *retval = new Mask();
392 return retval;
393 }
394 void deallocate_mask(KMPAffinity::Mask *m) override {
395 KMPNativeAffinity::Mask *native_mask =
396 static_cast<KMPNativeAffinity::Mask *>(m);
397 delete native_mask;
398 }
399 KMPAffinity::Mask *allocate_mask_array(int num) override {
400 return new Mask[num];
401 }
402 void deallocate_mask_array(KMPAffinity::Mask *array) override {
403 Mask *linux_array = static_cast<Mask *>(array);
404 delete[] linux_array;
405 }
406 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
407 int index) override {
408 Mask *linux_array = static_cast<Mask *>(array);
409 return &(linux_array[index]);
410 }
411 api_type get_api_type() const override { return NATIVE_OS; }
412};
413#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
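// Example sketch: the word/bit arithmetic that Mask::set()/is_set()/clear()
// above rely on, assuming the LP64 case where mask_t is a 64-bit unsigned long.
#if 0 // illustrative only
static bool example_mask_indexing(unsigned long *words, int cpu) {
  const unsigned bits_per_word = sizeof(unsigned long) * CHAR_BIT; // 64 on LP64
  unsigned word = cpu / bits_per_word; // e.g. cpu 70 -> word 1
  unsigned bit = cpu % bits_per_word;  //            -> bit 6
  words[word] |= (1UL << bit);              // what Mask::set(cpu) does
  return (words[word] & (1UL << bit)) != 0; // what Mask::is_set(cpu) tests
}
#endif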
414
415#if KMP_OS_WINDOWS
416class KMPNativeAffinity : public KMPAffinity {
417 class Mask : public KMPAffinity::Mask {
418 typedef ULONG_PTR mask_t;
419 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
420 mask_t *mask;
421
422 public:
423 Mask() {
424 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
425 }
426 ~Mask() {
427 if (mask)
428 __kmp_free(mask);
429 }
430 void set(int i) override {
431 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
432 }
433 bool is_set(int i) const override {
434 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
435 }
436 void clear(int i) override {
437 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
438 }
439 void zero() override {
440 for (int i = 0; i < __kmp_num_proc_groups; ++i)
441 mask[i] = 0;
442 }
443 void copy(const KMPAffinity::Mask *src) override {
444 const Mask *convert = static_cast<const Mask *>(src);
445 for (int i = 0; i < __kmp_num_proc_groups; ++i)
446 mask[i] = convert->mask[i];
447 }
448 void bitwise_and(const KMPAffinity::Mask *rhs) override {
449 const Mask *convert = static_cast<const Mask *>(rhs);
450 for (int i = 0; i < __kmp_num_proc_groups; ++i)
451 mask[i] &= convert->mask[i];
452 }
453 void bitwise_or(const KMPAffinity::Mask *rhs) override {
454 const Mask *convert = static_cast<const Mask *>(rhs);
455 for (int i = 0; i < __kmp_num_proc_groups; ++i)
456 mask[i] |= convert->mask[i];
457 }
458 void bitwise_not() override {
459 for (int i = 0; i < __kmp_num_proc_groups; ++i)
460 mask[i] = ~(mask[i]);
461 }
462 int begin() const override {
463 int retval = 0;
464 while (retval < end() && !is_set(retval))
465 ++retval;
466 return retval;
467 }
468 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
469 int next(int previous) const override {
470 int retval = previous + 1;
471 while (retval < end() && !is_set(retval))
472 ++retval;
473 return retval;
474 }
475 int set_process_affinity(bool abort_on_error) const override {
476 if (__kmp_num_proc_groups <= 1) {
477 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
478 DWORD error = GetLastError();
479 if (abort_on_error) {
480 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
481 __kmp_msg_null);
482 }
483 return error;
484 }
485 }
486 return 0;
487 }
488 int set_system_affinity(bool abort_on_error) const override {
489 if (__kmp_num_proc_groups > 1) {
490 // Check for a valid mask.
491 GROUP_AFFINITY ga;
492 int group = get_proc_group();
493 if (group < 0) {
494 if (abort_on_error) {
495 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
496 }
497 return -1;
498 }
499 // Transform the bit vector into a GROUP_AFFINITY struct
500 // and make the system call to set affinity.
501 ga.Group = group;
502 ga.Mask = mask[group];
503 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
504
505 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
506 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
507 DWORD error = GetLastError();
508 if (abort_on_error) {
509 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
510 __kmp_msg_null);
511 }
512 return error;
513 }
514 } else {
515 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
516 DWORD error = GetLastError();
517 if (abort_on_error) {
518 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
519 __kmp_msg_null);
520 }
521 return error;
522 }
523 }
524 return 0;
525 }
526 int get_system_affinity(bool abort_on_error) override {
527 if (__kmp_num_proc_groups > 1) {
528 this->zero();
529 GROUP_AFFINITY ga;
530 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
531 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
532 DWORD error = GetLastError();
533 if (abort_on_error) {
534 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
535 KMP_ERR(error), __kmp_msg_null);
536 }
537 return error;
538 }
539 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
540 (ga.Mask == 0)) {
541 return -1;
542 }
543 mask[ga.Group] = ga.Mask;
544 } else {
545 mask_t newMask, sysMask, retval;
546 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
547 DWORD error = GetLastError();
548 if (abort_on_error) {
549 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
550 KMP_ERR(error), __kmp_msg_null);
551 }
552 return error;
553 }
554 retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
555 if (!retval) {
556 DWORD error = GetLastError();
557 if (abort_on_error) {
558 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
559 KMP_ERR(error), __kmp_msg_null);
560 }
561 return error;
562 }
563 newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
564 if (!newMask) {
565 DWORD error = GetLastError();
566 if (abort_on_error) {
567 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
568 KMP_ERR(error), __kmp_msg_null);
569 }
570 }
571 *mask = retval;
572 }
573 return 0;
574 }
575 int get_proc_group() const override {
576 int group = -1;
577 if (__kmp_num_proc_groups == 1) {
578 return 1;
579 }
580 for (int i = 0; i < __kmp_num_proc_groups; i++) {
581 if (mask[i] == 0)
582 continue;
583 if (group >= 0)
584 return -1;
585 group = i;
586 }
587 return group;
588 }
589 };
590 void determine_capable(const char *env_var) override {
591 __kmp_affinity_determine_capable(env_var);
592 }
593 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
594 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
595 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
596 KMPAffinity::Mask *allocate_mask_array(int num) override {
597 return new Mask[num];
598 }
599 void deallocate_mask_array(KMPAffinity::Mask *array) override {
600 Mask *windows_array = static_cast<Mask *>(array);
601 delete[] windows_array;
602 }
603 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
604 int index) override {
605 Mask *windows_array = static_cast<Mask *>(array);
606 return &(windows_array[index]);
607 }
608 api_type get_api_type() const override { return NATIVE_OS; }
609};
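// Example sketch: the "exactly one non-empty processor group" rule that
// get_proc_group() above enforces, restated over a plain array of per-group
// mask words (the function name and parameters are illustrative only).
#if 0 // illustrative only
static int example_single_group(const ULONG_PTR *group_words, int num_groups) {
  int group = -1;
  for (int i = 0; i < num_groups; ++i) {
    if (group_words[i] == 0)
      continue; // this group contributes no CPUs
    if (group >= 0)
      return -1; // CPUs in more than one group: not one GROUP_AFFINITY
    group = i;
  }
  return group; // -1 if the mask is empty, otherwise the single group index
}
#endif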
610#endif /* KMP_OS_WINDOWS */
611#endif /* KMP_AFFINITY_SUPPORTED */
612
613class kmp_hw_thread_t {
614public:
615 static const int UNKNOWN_ID = -1;
616 static int compare_ids(const void *a, const void *b);
617 static int compare_compact(const void *a, const void *b);
618 int ids[KMP_HW_LAST];
619 int sub_ids[KMP_HW_LAST];
620 bool leader;
621 int os_id;
622 void print() const;
623 void clear() {
624 for (int i = 0; i < (int)KMP_HW_LAST; ++i)
625 ids[i] = UNKNOWN_ID;
626 leader = false;
627 }
628};
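// Example sketch: what ids[] and os_id might hold for one hardware thread on
// the 4-package x 6-core x 2-thread machine used as an example further below;
// the layer order (package, core, thread) and the particular values are
// assumptions of this sketch.
#if 0 // illustrative only
static void example_hw_thread(kmp_hw_thread_t &hwt) {
  hwt.clear();    // every ids[] entry becomes UNKNOWN_ID, leader = false
  hwt.ids[0] = 1; // package-level id
  hwt.ids[1] = 3; // core-level id
  hwt.ids[2] = 0; // thread-level id within the core
  hwt.os_id = 55; // OS processor number reported by the enumerator (made up)
}
#endif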
629
630class kmp_topology_t {
631
632 struct flags_t {
633 int uniform : 1;
634 int reserved : 31;
635 };
636
637 int depth;
638
639 // The following arrays are all 'depth' long
640
 641 // Ordered array of the types in the topology
642 kmp_hw_t *types;
643
 644 // Keep quick topology ratios; for non-uniform topologies,
 645 // each entry holds the max number of type-A items per type-B item,
646 // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
647 int *ratio;
648
649 // Storage containing the absolute number of each topology layer
650 int *count;
651
652 // The hardware threads array
653 // hw_threads is num_hw_threads long
654 // Each hw_thread's ids and sub_ids are depth deep
655 int num_hw_threads;
656 kmp_hw_thread_t *hw_threads;
657
658 // Equivalence hash where the key is the hardware topology item
659 // and the value is the equivalent hardware topology type in the
 660 // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
661 // known equivalence for the topology type
662 kmp_hw_t equivalent[KMP_HW_LAST];
663
664 // Flags describing the topology
665 flags_t flags;
666
667 // Count each item & get the num x's per y
668 // e.g., get the number of cores and the number of threads per core
669 // for each (x, y) in (KMP_HW_* , KMP_HW_*)
670 void _gather_enumeration_information();
671
672 // Remove layers that don't add information to the topology.
673 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
674 void _remove_radix1_layers();
675
676 // Find out if the topology is uniform
677 void _discover_uniformity();
678
679 // Set all the sub_ids for each hardware thread
680 void _set_sub_ids();
681
682 // Set global affinity variables describing the number of threads per
683 // core, the number of packages, the number of cores per package, and
684 // the number of cores.
685 void _set_globals();
686
687 // Set the last level cache equivalent type
688 void _set_last_level_cache();
689
690public:
691 // Force use of allocate()/deallocate()
692 kmp_topology_t() = delete;
693 kmp_topology_t(const kmp_topology_t &t) = delete;
694 kmp_topology_t(kmp_topology_t &&t) = delete;
695 kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
696 kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
697
698 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
699 static void deallocate(kmp_topology_t *);
700
701 // Functions used in create_map() routines
702 kmp_hw_thread_t &at(int index) {
703 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
704 return hw_threads[index];
705 }
706 const kmp_hw_thread_t &at(int index) const {
707 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
708 return hw_threads[index];
709 }
710 int get_num_hw_threads() const { return num_hw_threads; }
711 void sort_ids() {
712 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
713 kmp_hw_thread_t::compare_ids);
714 }
 715 // Check if the hardware ids are unique; if they are,
 716 // return true, otherwise return false
717 bool check_ids() const;
718
719 // Function to call after the create_map() routine
720 void canonicalize();
721 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
722
723 // Functions used after canonicalize() called
724 bool filter_hw_subset();
725 bool is_close(int hwt1, int hwt2, int level) const;
726 bool is_uniform() const { return flags.uniform; }
 727 // Return the topology type equivalent to the given type;
 728 // returns KMP_HW_UNKNOWN when there is no equivalent type
729 kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
730 // Set type1 = type2
731 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
732 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
733 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
734 kmp_hw_t real_type2 = equivalent[type2];
735 if (real_type2 == KMP_HW_UNKNOWN)
736 real_type2 = type2;
737 equivalent[type1] = real_type2;
738 // This loop is required since any of the types may have been set to
739 // be equivalent to type1. They all must be checked and reset to type2.
740 KMP_FOREACH_HW_TYPE(type) {
741 if (equivalent[type] == type1) {
742 equivalent[type] = real_type2;
743 }
744 }
745 }
746 // Calculate number of types corresponding to level1
747 // per types corresponding to level2 (e.g., number of threads per core)
748 int calculate_ratio(int level1, int level2) const {
749 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
750 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
751 int r = 1;
752 for (int level = level1; level > level2; --level)
753 r *= ratio[level];
754 return r;
755 }
756 int get_ratio(int level) const {
757 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
758 return ratio[level];
759 }
 760 int get_depth() const { return depth; }
761 kmp_hw_t get_type(int level) const {
762 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
763 return types[level];
764 }
765 int get_level(kmp_hw_t type) const {
766 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
767 int eq_type = equivalent[type];
768 if (eq_type == KMP_HW_UNKNOWN)
769 return -1;
770 for (int i = 0; i < depth; ++i)
771 if (types[i] == eq_type)
772 return i;
773 return -1;
774 }
775 int get_count(int level) const {
776 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
777 return count[level];
778 }
779#if KMP_AFFINITY_SUPPORTED
780 void sort_compact() {
781 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
782 kmp_hw_thread_t::compare_compact);
783 }
784#endif
785 void print(const char *env_var = "KMP_AFFINITY") const;
786 void dump() const;
787};
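// Example sketch: the calculate_ratio() arithmetic applied to the example
// topology from the ratio[] comment above, [ 4 packages | 6 cores/package |
// 2 threads/core ]. The level indices (0 = package, 2 = thread) are assumed.
#if 0 // illustrative only
static int example_threads_per_package() {
  int ratio[3] = {4, 6, 2}; // level 0 = package, 1 = core, 2 = thread
  // calculate_ratio(/*level1=*/2, /*level2=*/0) multiplies the ratios of all
  // levels strictly below level2:
  int r = 1;
  for (int level = 2; level > 0; --level)
    r *= ratio[level]; // 2, then 2 * 6
  return r;            // 12 hardware threads per package
}
#endif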
788
789class kmp_hw_subset_t {
790public:
791 struct item_t {
792 int num;
793 kmp_hw_t type;
794 int offset;
795 };
796
797private:
798 int depth;
799 int capacity;
800 item_t *items;
801 kmp_uint64 set;
802 bool absolute;
803 // The set must be able to handle up to KMP_HW_LAST number of layers
804 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
805
806public:
807 // Force use of allocate()/deallocate()
808 kmp_hw_subset_t() = delete;
809 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
810 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
811 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
812 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
813
814 static kmp_hw_subset_t *allocate() {
815 int initial_capacity = 5;
816 kmp_hw_subset_t *retval =
817 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
818 retval->depth = 0;
819 retval->capacity = initial_capacity;
820 retval->set = 0ull;
821 retval->absolute = false;
822 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
823 return retval;
824 }
825 static void deallocate(kmp_hw_subset_t *subset) {
826 __kmp_free(subset->items);
827 __kmp_free(subset);
828 }
829 void set_absolute() { absolute = true; }
830 bool is_absolute() const { return absolute; }
831 void push_back(int num, kmp_hw_t type, int offset) {
832 if (depth == capacity - 1) {
833 capacity *= 2;
834 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
835 for (int i = 0; i < depth; ++i)
836 new_items[i] = items[i];
837 __kmp_free(items);
838 items = new_items;
839 }
840 items[depth].num = num;
841 items[depth].type = type;
842 items[depth].offset = offset;
843 depth++;
844 set |= (1ull << type);
845 }
846 int get_depth() const { return depth; }
847 const item_t &at(int index) const {
848 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
849 return items[index];
850 }
851 item_t &at(int index) {
852 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
853 return items[index];
854 }
855 void remove(int index) {
856 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
857 set &= ~(1ull << items[index].type);
858 for (int j = index + 1; j < depth; ++j) {
859 items[j - 1] = items[j];
860 }
861 depth--;
862 }
863 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
864 void dump() const {
865 printf("**********************\n");
866 printf("*** kmp_hw_subset: ***\n");
867 printf("* depth: %d\n", depth);
868 printf("* items:\n");
869 for (int i = 0; i < depth; ++i) {
870 printf("num: %d, type: %s, offset: %d\n", items[i].num,
871 __kmp_hw_get_keyword(items[i].type), items[i].offset);
872 }
873 printf("* set: 0x%llx\n", set);
874 printf("* absolute: %d\n", absolute);
875 printf("**********************\n");
876 }
877};
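// Example sketch: how a KMP_HW_SUBSET-style request such as "2s,4c@2,2t" might
// be recorded through push_back() and queried with specified(). The mapping
// from the string to these calls is an assumption of the sketch, not the
// parser's actual code.
#if 0 // illustrative only
static void example_hw_subset() {
  kmp_hw_subset_t *s = kmp_hw_subset_t::allocate();
  s->push_back(2, KMP_HW_SOCKET, 0); // 2 sockets
  s->push_back(4, KMP_HW_CORE, 2);   // 4 cores, starting at offset 2
  s->push_back(2, KMP_HW_THREAD, 0); // 2 threads per core
  if (s->specified(KMP_HW_CORE)) {
    // The core layer was requested: bit (1ull << KMP_HW_CORE) is set in `set`.
  }
  kmp_hw_subset_t::deallocate(s);
}
#endif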
878
879extern kmp_topology_t *__kmp_topology;
880extern kmp_hw_subset_t *__kmp_hw_subset;
881
882/* A structure for holding machine-specific hierarchy info to be computed once
883 at init. This structure represents a mapping of threads to the actual machine
884 hierarchy, or to our best guess at what the hierarchy might be, for the
885 purpose of performing an efficient barrier. In the worst case, when there is
886 no machine hierarchy information, it produces a tree suitable for a barrier,
887 similar to the tree used in the hyper barrier. */
888class hierarchy_info {
889public:
890 /* Good default values for number of leaves and branching factor, given no
891 affinity information. Behaves a bit like hyper barrier. */
892 static const kmp_uint32 maxLeaves = 4;
893 static const kmp_uint32 minBranch = 4;
899 kmp_uint32 maxLevels;
900
905 kmp_uint32 depth;
906 kmp_uint32 base_num_threads;
907 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
908 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
909 // 2=initialization in progress
910 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
911
916 kmp_uint32 *numPerLevel;
917 kmp_uint32 *skipPerLevel;
918
919 void deriveLevels() {
920 int hier_depth = __kmp_topology->get_depth();
921 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
922 numPerLevel[level] = __kmp_topology->get_ratio(i);
923 }
924 }
925
926 hierarchy_info()
927 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
928
929 void fini() {
930 if (!uninitialized && numPerLevel) {
931 __kmp_free(numPerLevel);
932 numPerLevel = NULL;
933 uninitialized = not_initialized;
934 }
935 }
936
937 void init(int num_addrs) {
938 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
939 &uninitialized, not_initialized, initializing);
940 if (bool_result == 0) { // Wait for initialization
941 while (TCR_1(uninitialized) != initialized)
942 KMP_CPU_PAUSE();
943 return;
944 }
945 KMP_DEBUG_ASSERT(bool_result == 1);
946
 947 /* Explicitly initialize the data fields here to prevent use of dirty
 948 values observed when the static library is re-initialized multiple times
 949 (e.g. when a non-OpenMP thread repeatedly launches/joins a thread that
 950 uses OpenMP). */
951 depth = 1;
952 resizing = 0;
953 maxLevels = 7;
954 numPerLevel =
955 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
956 skipPerLevel = &(numPerLevel[maxLevels]);
957 for (kmp_uint32 i = 0; i < maxLevels;
958 ++i) { // init numPerLevel[*] to 1 item per level
959 numPerLevel[i] = 1;
960 skipPerLevel[i] = 1;
961 }
962
 963 // Derive the per-level counts from the machine topology, if available
964 if (__kmp_topology && __kmp_topology->get_depth() > 0) {
965 deriveLevels();
966 } else {
967 numPerLevel[0] = maxLeaves;
968 numPerLevel[1] = num_addrs / maxLeaves;
969 if (num_addrs % maxLeaves)
970 numPerLevel[1]++;
971 }
972
973 base_num_threads = num_addrs;
974 for (int i = maxLevels - 1; i >= 0;
975 --i) // count non-empty levels to get depth
976 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
977 depth++;
978
979 kmp_uint32 branch = minBranch;
980 if (numPerLevel[0] == 1)
981 branch = num_addrs / maxLeaves;
982 if (branch < minBranch)
983 branch = minBranch;
984 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
985 while (numPerLevel[d] > branch ||
986 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
987 if (numPerLevel[d] & 1)
988 numPerLevel[d]++;
989 numPerLevel[d] = numPerLevel[d] >> 1;
990 if (numPerLevel[d + 1] == 1)
991 depth++;
992 numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
993 }
994 if (numPerLevel[0] == 1) {
995 branch = branch >> 1;
996 if (branch < 4)
997 branch = minBranch;
998 }
999 }
1000
1001 for (kmp_uint32 i = 1; i < depth; ++i)
1002 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1003 // Fill in hierarchy in the case of oversubscription
1004 for (kmp_uint32 i = depth; i < maxLevels; ++i)
1005 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1006
1007 uninitialized = initialized; // One writer
1008 }
1009
1010 // Resize the hierarchy if nproc changes to something larger than before
1011 void resize(kmp_uint32 nproc) {
1012 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1013 while (bool_result == 0) { // someone else is trying to resize
1014 KMP_CPU_PAUSE();
1015 if (nproc <= base_num_threads) // happy with other thread's resize
1016 return;
1017 else // try to resize
1018 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1019 }
1020 KMP_DEBUG_ASSERT(bool_result != 0);
1021 if (nproc <= base_num_threads)
1022 return; // happy with other thread's resize
1023
1024 // Calculate new maxLevels
1025 kmp_uint32 old_sz = skipPerLevel[depth - 1];
1026 kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1027 // First see if old maxLevels is enough to contain new size
1028 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1029 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1030 numPerLevel[i - 1] *= 2;
1031 old_sz *= 2;
1032 depth++;
1033 }
1034 if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1035 while (nproc > old_sz) {
1036 old_sz *= 2;
1037 incs++;
1038 depth++;
1039 }
1040 maxLevels += incs;
1041
1042 // Resize arrays
1043 kmp_uint32 *old_numPerLevel = numPerLevel;
1044 kmp_uint32 *old_skipPerLevel = skipPerLevel;
1045 numPerLevel = skipPerLevel = NULL;
1046 numPerLevel =
1047 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1048 skipPerLevel = &(numPerLevel[maxLevels]);
1049
1050 // Copy old elements from old arrays
1051 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1052 // init numPerLevel[*] to 1 item per level
1053 numPerLevel[i] = old_numPerLevel[i];
1054 skipPerLevel[i] = old_skipPerLevel[i];
1055 }
1056
1057 // Init new elements in arrays to 1
1058 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1059 // init numPerLevel[*] to 1 item per level
1060 numPerLevel[i] = 1;
1061 skipPerLevel[i] = 1;
1062 }
1063
1064 // Free old arrays
1065 __kmp_free(old_numPerLevel);
1066 }
1067
1068 // Fill in oversubscription levels of hierarchy
1069 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1070 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1071
1072 base_num_threads = nproc;
1073 resizing = 0; // One writer
1074 }
1075};
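// Example sketch: the per-level tables that init(16) produces when no machine
// topology is available (maxLeaves = 4, maxLevels = 7, depth becomes 3). The
// recurrences below mirror the loops in init(); the concrete numbers are a
// worked example, not additional runtime behavior.
#if 0 // illustrative only
static void example_hierarchy_init_16() {
  kmp_uint32 numPerLevel[7] = {4, 4, 1, 1, 1, 1, 1}; // 4 leaves, 16/4 branches
  kmp_uint32 skipPerLevel[7] = {1, 1, 1, 1, 1, 1, 1};
  for (int i = 1; i < 3; ++i) // levels below depth: product of child counts
    skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1]; // -> 4, 16
  for (int i = 3; i < 7; ++i) // oversubscription levels double each time
    skipPerLevel[i] = 2 * skipPerLevel[i - 1]; // -> 32, 64, 128, 256
}
#endif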
1076#endif // KMP_AFFINITY_H