LLVM OpenMP* Runtime Library
kmp_stats.h
1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
8 //===----------------------------------------------------------------------===//
9 //
10 // The LLVM Compiler Infrastructure
11 //
12 // This file is dual licensed under the MIT and the University of Illinois Open
13 // Source Licenses. See LICENSE.txt for details.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "kmp_config.h"
18 #include "kmp_debug.h"
19 
20 #if KMP_STATS_ENABLED
21 /* Statistics accumulator.
22  Accumulates number of samples and computes min, max, mean, standard deviation
23  on the fly.
24 
25  Online variance calculation algorithm from
26  http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
27  */
28 
29 #include "kmp_stats_timing.h"
30 #include <limits>
31 #include <math.h>
32 #include <new> // placement new
33 #include <stdint.h>
34 #include <string>
35 #include <vector>
36 
37 /* Enable developer statistics here if you want them. They are more detailed
38  than is useful for application characterisation and are intended for the
39  runtime library developer. */
40 #define KMP_DEVELOPER_STATS 0
41 
42 /* Enable/Disable histogram output */
43 #define KMP_STATS_HIST 0
44 
51  noTotal = 1 << 0,
52  onlyInMaster = 1 << 1,
53  noUnits = 1 << 2,
54  notInMaster = 1 << 3,
55  logEvent = 1 << 4
56 };
58 
65  IDLE,
66  SERIAL_REGION,
67  FORK_JOIN_BARRIER,
68  PLAIN_BARRIER,
69  TASKWAIT,
70  TASKYIELD,
71  TASKGROUP,
72  IMPLICIT_TASK,
73  EXPLICIT_TASK
74 };
75 
// Table of the statistics counters, written as an X-macro: `macro` is invoked
// once per counter as macro(name, flags, arg). `flags` is 0 or an OR of
// stats_flags_e bits; `arg` is handed unchanged to every invocation.
// counter_e is generated from this list, so add new counters here.
94 // clang-format off
95 #define KMP_FOREACH_COUNTER(macro, arg) \
96  macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \
97  macro(OMP_NESTED_PARALLEL, 0, arg) \
98  macro(OMP_LOOP_STATIC, 0, arg) \
99  macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
100  macro(OMP_LOOP_DYNAMIC, 0, arg) \
101  macro(OMP_DISTRIBUTE, 0, arg) \
102  macro(OMP_BARRIER, 0, arg) \
103  macro(OMP_CRITICAL, 0, arg) \
104  macro(OMP_SINGLE, 0, arg) \
105  macro(OMP_MASTER, 0, arg) \
106  macro(OMP_TEAMS, 0, arg) \
107  macro(OMP_set_lock, 0, arg) \
108  macro(OMP_test_lock, 0, arg) \
109  macro(REDUCE_wait, 0, arg) \
110  macro(REDUCE_nowait, 0, arg) \
111  macro(OMP_TASKYIELD, 0, arg) \
112  macro(OMP_TASKLOOP, 0, arg) \
113  macro(TASK_executed, 0, arg) \
114  macro(TASK_cancelled, 0, arg) \
115  macro(TASK_stolen, 0, arg)
116 // clang-format on
117 
// Table of the statistics timers, written as an X-macro: `macro` is invoked
// once per entry as macro(name, flags, arg), where `flags` is 0 or an OR of
// stats_flags_e bits. timer_e is generated from this list. The entries flagged
// noUnits | noTotal record plain values (thread counts, argument counts,
// iteration counts) rather than times; see the per-entry descriptions that
// follow the macro. The list ends by expanding KMP_FOREACH_DEVELOPER_TIMER,
// which contributes entries only when KMP_DEVELOPER_STATS is non-zero.
136 // clang-format off
137 #define KMP_FOREACH_TIMER(macro, arg) \
138  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
139  macro (OMP_parallel, stats_flags_e::logEvent, arg) \
140  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
141  macro (OMP_loop_static, 0, arg) \
142  macro (OMP_loop_static_scheduling, 0, arg) \
143  macro (OMP_loop_dynamic, 0, arg) \
144  macro (OMP_loop_dynamic_scheduling, 0, arg) \
145  macro (OMP_critical, 0, arg) \
146  macro (OMP_critical_wait, 0, arg) \
147  macro (OMP_single, 0, arg) \
148  macro (OMP_master, 0, arg) \
149  macro (OMP_task_immediate, 0, arg) \
150  macro (OMP_task_taskwait, 0, arg) \
151  macro (OMP_task_taskyield, 0, arg) \
152  macro (OMP_task_taskgroup, 0, arg) \
153  macro (OMP_task_join_bar, 0, arg) \
154  macro (OMP_task_plain_bar, 0, arg) \
155  macro (OMP_taskloop_scheduling, 0, arg) \
156  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
157  macro (OMP_idle, stats_flags_e::logEvent, arg) \
158  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
159  macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
160  macro (OMP_serial, stats_flags_e::logEvent, arg) \
161  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
162  arg) \
163  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
164  arg) \
165  macro (OMP_loop_static_iterations, \
166  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
167  macro (OMP_loop_dynamic_iterations, \
168  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
169  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
170 // clang-format on
171 
172 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
173 // initializing OpenMP or being created by a master)
174 // until the thread is destroyed
175 // OMP_parallel -- Time thread spends executing work directly
176 // within a #pragma omp parallel
177 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
178 // OMP_loop_static -- Time thread spends executing loop iterations from
179 // a statically scheduled loop
180 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
181 // from a statically scheduled loop
182 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
183 // a dynamically scheduled loop
184 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
185 // from a dynamically scheduled loop
186 // OMP_critical -- Time thread spends executing critical section
187 // OMP_critical_wait -- Time thread spends waiting to enter
188 // a critical section
189 // OMP_single -- Time spent executing a "single" region
190 // OMP_master -- Time spent executing a "master" region
191 // OMP_task_immediate -- Time spent executing non-deferred tasks
192 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
193 // construct
194 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
195 // construct
196 // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
197 // construct
198 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
199 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
200 // construct
201 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
202 // construct
203 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
204 // inside implicit barrier at end of worksharing
205 // construct
206 // OMP_idle -- Time worker threads spend waiting for next
207 // parallel region
208 // OMP_fork_barrier -- Time spent in the fork barrier surrounding a
209 // parallel region
210 // OMP_join_barrier -- Time spent in the join barrier surrounding a
211 // parallel region
212 // OMP_serial -- Time thread zero spends executing serial code
213 // OMP_set_numthreads -- Values passed to omp_set_num_threads
214 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
215 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
216 // statically scheduled loops
217 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
218 // dynamically scheduled loops
219 
// Developer-only timer entries. KMP_FOREACH_DEVELOPER_TIMER follows the same
// macro(name, flags, arg) X-macro shape as KMP_FOREACH_TIMER, which expands it
// at the end of its own list. When KMP_DEVELOPER_STATS is 0 the macro is
// defined empty, so no developer entries are generated.
220 #if (KMP_DEVELOPER_STATS)
221 // Timers which are of interest to runtime library developers, not end users.
222 // These have to be explicitly enabled in addition to the other stats.
223 
224 // KMP_fork_barrier -- time in __kmp_fork_barrier
225 // KMP_join_barrier -- time in __kmp_join_barrier
226 // KMP_barrier -- time in __kmp_barrier
227 // KMP_end_split_barrier -- time in __kmp_end_split_barrier
228 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
229 // KMP_icv_copy -- start/stop timer for any ICV copying
230 // KMP_linear_gather -- time in __kmp_linear_barrier_gather
231 // KMP_linear_release -- time in __kmp_linear_barrier_release
232 // KMP_tree_gather -- time in __kmp_tree_barrier_gather
233 // KMP_tree_release -- time in __kmp_tree_barrier_release
234 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
235 // KMP_hyper_release -- time in __kmp_hyper_barrier_release
// NOTE(review): the list above is not in sync with the macro below. It names
// KMP_fork_barrier, KMP_join_barrier, KMP_barrier and KMP_icv_copy, which the
// macro does not define, and omits KMP_fork_call, KMP_join_call,
// KMP_hier_gather/release, USER_resume, USER_suspend, KMP_allocate_team,
// USER_icv_copy and the FOR_static_steal_* entries -- confirm and update.
236 // clang-format off
237 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
238  macro(KMP_fork_call, 0, arg) \
239  macro(KMP_join_call, 0, arg) \
240  macro(KMP_end_split_barrier, 0, arg) \
241  macro(KMP_hier_gather, 0, arg) \
242  macro(KMP_hier_release, 0, arg) \
243  macro(KMP_hyper_gather, 0, arg) \
244  macro(KMP_hyper_release, 0, arg) \
245  macro(KMP_linear_gather, 0, arg) \
246  macro(KMP_linear_release, 0, arg) \
247  macro(KMP_tree_gather, 0, arg) \
248  macro(KMP_tree_release, 0, arg) \
249  macro(USER_resume, 0, arg) \
250  macro(USER_suspend, 0, arg) \
251  macro(KMP_allocate_team, 0, arg) \
252  macro(KMP_setup_icv_copy, 0, arg) \
253  macro(USER_icv_copy, 0, arg) \
254  macro (FOR_static_steal_stolen, \
255  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
256  macro (FOR_static_steal_chunks, \
257  stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
258 #else
259 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
260 #endif
261 // clang-format on
262 
// The explicit-timer list is simply the full timer list.
282 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
283 
// ENUMERATE turns one table entry into an enumerator named prefix##name; the
// flags argument is ignored. Each enum below therefore has one enumerator per
// table entry, in table order, followed by a *_LAST enumerator whose value is
// the number of entries.
284 #define ENUMERATE(name, ignore, prefix) prefix##name,
285 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
286 
// Same entries as timer_e, with the EXPLICIT_TIMER_ prefix.
287 enum explicit_timer_e {
288  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
289 };
290 
291 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
// ENUMERATE is a local helper and is not available past this point.
292 #undef ENUMERATE
293 
294 /*
295  * A logarithmic histogram. It accumulates the number of values in each power of
296  * ten bin. So 1<=x<10, 10<=x<100, ...
297  * Mostly useful where we have some big outliers and want to see information
298  * about them.
299  */
300 class logHistogram {
301  enum {
302  numBins = 31, /* Number of powers of 10. If this changes you need to change
303  * the initializer for binMax */
304 
305  /*
306  * If you want to use this to analyse values that may be less than 1, (for
307  * instance times in s), then the logOffset gives you negative powers.
308  * In our case here, we're just looking at times in ticks, or counts, so we
309  * can never see values with magnitude < 1 (other than zero), so we can set
310  * it to 0. As above change the initializer if you change this.
311  */
312  logOffset = 0
313  };
314  uint32_t KMP_ALIGN_CACHE zeroCount;
315  struct {
316  uint32_t count;
317  double total;
318  } bins[numBins];
319 
320  static double binMax[numBins];
321 
322 #ifdef KMP_DEBUG
323  uint64_t _total;
324 
325  void check() const {
326  uint64_t t = zeroCount;
327  for (int i = 0; i < numBins; i++)
328  t += bins[i].count;
329  KMP_DEBUG_ASSERT(t == _total);
330  }
331 #else
332  void check() const {}
333 #endif
334 
335 public:
336  logHistogram() { reset(); }
337 
338  logHistogram(logHistogram const &o) {
339  for (int i = 0; i < numBins; i++)
340  bins[i] = o.bins[i];
341 #ifdef KMP_DEBUG
342  _total = o._total;
343 #endif
344  }
345 
346  void reset() {
347  zeroCount = 0;
348  for (int i = 0; i < numBins; i++) {
349  bins[i].count = 0;
350  bins[i].total = 0;
351  }
352 
353 #ifdef KMP_DEBUG
354  _total = 0;
355 #endif
356  }
357  uint32_t count(int b) const { return bins[b + logOffset].count; }
358  double total(int b) const { return bins[b + logOffset].total; }
359  static uint32_t findBin(double sample);
360 
361  logHistogram &operator+=(logHistogram const &o) {
362  zeroCount += o.zeroCount;
363  for (int i = 0; i < numBins; i++) {
364  bins[i].count += o.bins[i].count;
365  bins[i].total += o.bins[i].total;
366  }
367 #ifdef KMP_DEBUG
368  _total += o._total;
369  check();
370 #endif
371 
372  return *this;
373  }
374 
375  void addSample(double sample);
376  int minBin() const;
377  int maxBin() const;
378 
379  std::string format(char) const;
380 };
381 
382 class statistic {
383  double KMP_ALIGN_CACHE minVal;
384  double maxVal;
385  double meanVal;
386  double m2;
387  uint64_t sampleCount;
388  double offset;
389  bool collectingHist;
390  logHistogram hist;
391 
392 public:
393  statistic(bool doHist = bool(KMP_STATS_HIST)) {
394  reset();
395  collectingHist = doHist;
396  }
397  statistic(statistic const &o)
398  : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
399  sampleCount(o.sampleCount), offset(o.offset),
400  collectingHist(o.collectingHist), hist(o.hist) {}
401  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
402  : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
403  sampleCount(sc), offset(0.0), collectingHist(false) {}
404  bool haveHist() const { return collectingHist; }
405  double getMin() const { return minVal; }
406  double getMean() const { return meanVal; }
407  double getMax() const { return maxVal; }
408  uint64_t getCount() const { return sampleCount; }
409  double getSD() const { return sqrt(m2 / sampleCount); }
410  double getTotal() const { return sampleCount * meanVal; }
411  logHistogram const *getHist() const { return &hist; }
412  void setOffset(double d) { offset = d; }
413 
414  void reset() {
415  minVal = std::numeric_limits<double>::max();
416  maxVal = -minVal;
417  meanVal = 0.0;
418  m2 = 0.0;
419  sampleCount = 0;
420  offset = 0.0;
421  hist.reset();
422  }
423  void addSample(double sample);
424  void scale(double factor);
425  void scaleDown(double f) { scale(1. / f); }
426  void forceCount(uint64_t count) { sampleCount = count; }
427  statistic &operator+=(statistic const &other);
428 
429  std::string format(char unit, bool total = false) const;
430  std::string formatHist(char unit) const { return hist.format(unit); }
431 };
432 
// Static description of one statistic (timer or counter); timeStat and counter
// each keep an array of these, indexed by the corresponding enum.
433 struct statInfo {
434  const char *name; // name returned by timeStat::name() / counter::name()
435  uint32_t flags; // bit set tested against stats_flags_e values
436 };
437 
438 class timeStat : public statistic {
439  static statInfo timerInfo[];
440 
441 public:
442  timeStat() : statistic() {}
443  static const char *name(timer_e e) { return timerInfo[e].name; }
444  static bool noTotal(timer_e e) {
445  return timerInfo[e].flags & stats_flags_e::noTotal;
446  }
447  static bool masterOnly(timer_e e) {
448  return timerInfo[e].flags & stats_flags_e::onlyInMaster;
449  }
450  static bool workerOnly(timer_e e) {
451  return timerInfo[e].flags & stats_flags_e::notInMaster;
452  }
453  static bool noUnits(timer_e e) {
454  return timerInfo[e].flags & stats_flags_e::noUnits;
455  }
456  static bool logEvent(timer_e e) {
457  return timerInfo[e].flags & stats_flags_e::logEvent;
458  }
459  static void clearEventFlags() {
460  for (int i = 0; i < TIMER_LAST; i++) {
461  timerInfo[i].flags &= (~(stats_flags_e::logEvent));
462  }
463  }
464 };
465 
466 // Where we need explicitly to start and end the timer, this version can be used
467 // Since these timers normally aren't nicely scoped, so don't have a good place
468 // to live on the stack of the thread, they're more work to use.
469 class explicitTimer {
470  timeStat *stat;
471  timer_e timerEnumValue;
472  tsc_tick_count startTime;
473  tsc_tick_count pauseStartTime;
474  tsc_tick_count::tsc_interval_t totalPauseTime;
475 
476 public:
477  explicitTimer(timeStat *s, timer_e te)
478  : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
479  totalPauseTime() {}
480 
481  // void setStat(timeStat *s) { stat = s; }
482  void start(tsc_tick_count tick);
483  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
484  void resume(tsc_tick_count tick) {
485  totalPauseTime += (tick - pauseStartTime);
486  }
487  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
488  void reset() {
489  startTime = 0;
490  pauseStartTime = 0;
491  totalPauseTime = 0;
492  }
493  timer_e get_type() const { return timerEnumValue; }
494 };
495 
496 // Where you need to partition a threads clock ticks into separate states
497 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
498 // DOING_NOTHING would render these conditions:
499 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
500 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
501 // versa
502 class partitionedTimers {
503 private:
504  std::vector<explicitTimer> timer_stack;
505 
506 public:
507  partitionedTimers();
508  void init(explicitTimer timer);
509  void exchange(explicitTimer timer);
510  void push(explicitTimer timer);
511  void pop();
512  void windup();
513 };
514 
515 // Special wrapper around the partioned timers to aid timing code blocks
516 // It avoids the need to have an explicit end, leaving the scope suffices.
517 class blockPartitionedTimer {
518  partitionedTimers *part_timers;
519 
520 public:
521  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
522  : part_timers(pt) {
523  part_timers->push(timer);
524  }
525  ~blockPartitionedTimer() { part_timers->pop(); }
526 };
527 
528 // Special wrapper around the thread state to aid in keeping state in code
529 // blocks It avoids the need to have an explicit end, leaving the scope
530 // suffices.
531 class blockThreadState {
532  stats_state_e *state_pointer;
533  stats_state_e old_state;
534 
535 public:
536  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
537  : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
538  *state_pointer = new_state;
539  }
540  ~blockThreadState() { *state_pointer = old_state; }
541 };
542 
543 // If all you want is a count, then you can use this...
544 // The individual per-thread counts will be aggregated into a statistic at
545 // program exit.
546 class counter {
547  uint64_t value;
548  static const statInfo counterInfo[];
549 
550 public:
551  counter() : value(0) {}
552  void increment() { value++; }
553  uint64_t getValue() const { return value; }
554  void reset() { value = 0; }
555  static const char *name(counter_e e) { return counterInfo[e].name; }
556  static bool masterOnly(counter_e e) {
557  return counterInfo[e].flags & stats_flags_e::onlyInMaster;
558  }
559 };
560 
561 /* ****************************************************************
562  Class to implement an event
563 
564  There are four components to an event: start time, stop time
565  nest_level, and timer_name.
566  The start and stop time should be obvious (recorded in clock ticks).
567  The nest_level relates to the bar width in the timeline graph.
568  The timer_name is used to determine which timer event triggered this event.
569 
570  the interface to this class is through four read-only operations:
571  1) getStart() -- returns the start time as 64 bit integer
572  2) getStop() -- returns the stop time as 64 bit integer
573  3) getNestLevel() -- returns the nest level of the event
574  4) getTimerName() -- returns the timer name that triggered event
575 
576  *MORE ON NEST_LEVEL*
577  The nest level is used in the bar graph that represents the timeline.
578  Its main purpose is for showing how events are nested inside each other.
579  For example, say events, A, B, and C are recorded. If the timeline
580  looks like this:
581 
582 Begin -------------------------------------------------------------> Time
583  | | | | | |
584  A B C C B A
585  start start start end end end
586 
587  Then A, B, C will have a nest level of 1, 2, 3 respectively.
588  These values are then used to calculate the barwidth so you can
589  see that inside A, B has occurred, and inside B, C has occurred.
590  Currently, this is shown with A's bar width being larger than B's
591  bar width, and B's bar width being larger than C's bar width.
592 
593 **************************************************************** */
594 class kmp_stats_event {
595  uint64_t start;
596  uint64_t stop;
597  int nest_level;
598  timer_e timer_name;
599 
600 public:
601  kmp_stats_event()
602  : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
603  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
604  : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
605  inline uint64_t getStart() const { return start; }
606  inline uint64_t getStop() const { return stop; }
607  inline int getNestLevel() const { return nest_level; }
608  inline timer_e getTimerName() const { return timer_name; }
609 };
610 
611 /* ****************************************************************
612  Class to implement a dynamically expandable array of events
613 
614  ---------------------------------------------------------
615  | event 1 | event 2 | event 3 | event 4 | ... | event N |
616  ---------------------------------------------------------
617 
618  An event is pushed onto the back of this array at every
619  explicitTimer->stop() call. The event records the thread #,
620  start time, stop time, and nest level related to the bar width.
621 
622  The event vector starts at size INIT_SIZE and grows (doubles in size)
623  if needed. An implication of this behavior is that log(N)
624  reallocations are needed (where N is number of events). If you want
625  to avoid reallocations, then set INIT_SIZE to a large value.
626 
627  the interface to this class is through six operations:
628  1) reset() -- sets the internal_size back to 0 but does not deallocate any
629  memory
630  2) size() -- returns the number of valid elements in the vector
631  3) push_back(start, stop, nest, timer_name) -- pushes an event onto
632  the back of the array
633  4) deallocate() -- frees all memory associated with the vector
634  5) sort() -- sorts the vector by start time
635  6) operator[index] or at(index) -- returns event reference at that index
636 **************************************************************** */
637 class kmp_stats_event_vector {
638  kmp_stats_event *events;
639  int internal_size;
640  int allocated_size;
641  static const int INIT_SIZE = 1024;
642 
643 public:
644  kmp_stats_event_vector() {
645  events =
646  (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
647  internal_size = 0;
648  allocated_size = INIT_SIZE;
649  }
650  ~kmp_stats_event_vector() {}
651  inline void reset() { internal_size = 0; }
652  inline int size() const { return internal_size; }
653  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
654  timer_e name) {
655  int i;
656  if (internal_size == allocated_size) {
657  kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
658  sizeof(kmp_stats_event) * allocated_size * 2);
659  for (i = 0; i < internal_size; i++)
660  tmp[i] = events[i];
661  __kmp_free(events);
662  events = tmp;
663  allocated_size *= 2;
664  }
665  events[internal_size] =
666  kmp_stats_event(start_time, stop_time, nest_level, name);
667  internal_size++;
668  return;
669  }
670  void deallocate();
671  void sort();
672  const kmp_stats_event &operator[](int index) const { return events[index]; }
673  kmp_stats_event &operator[](int index) { return events[index]; }
674  const kmp_stats_event &at(int index) const { return events[index]; }
675  kmp_stats_event &at(int index) { return events[index]; }
676 };
677 
678 /* ****************************************************************
679  Class to implement a doubly-linked, circular, statistics list
680 
681  |---| ---> |---| ---> |---| ---> |---| ---> ... next
682  | | | | | | | |
683  |---| <--- |---| <--- |---| <--- |---| <--- ... prev
684  Sentinel first second third
685  Node node node node
686 
687  The Sentinel Node is the user handle on the list.
688  The first node corresponds to thread 0's statistics.
689  The second node corresponds to thread 1's statistics and so on...
690 
691  Each node has a _timers, _counters, and _explicitTimers array to hold that
692  thread's statistics. The _explicitTimers point to the correct _timer and
693  update its statistics at every stop() call. The explicitTimers' pointers are
694  set up in the constructor. Each node also has an event vector to hold that
695  thread's timing events. The event vector expands as necessary and records
696  the start-stop times for each timer.
697 
698  The nestLevel variable is for plotting events and is related
699  to the bar width in the timeline graph.
700 
701  Every thread will have a thread local pointer to its node in
702  the list. The sentinel node is used by the master thread to
703  store "dummy" statistics before __kmp_create_worker() is called.
704 **************************************************************** */
705 class kmp_stats_list {
706  int gtid;
707  timeStat _timers[TIMER_LAST + 1];
708  counter _counters[COUNTER_LAST + 1];
709  explicitTimer thread_life_timer;
710  partitionedTimers _partitionedTimers;
711  int _nestLevel; // one per thread
712  kmp_stats_event_vector _event_vector;
713  kmp_stats_list *next;
714  kmp_stats_list *prev;
715  stats_state_e state;
716  int thread_is_idle_flag;
717 
718 public:
719  kmp_stats_list()
720  : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
721  TIMER_OMP_worker_thread_life),
722  _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
723  thread_is_idle_flag(0) {}
724  ~kmp_stats_list() {}
725  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
726  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
727  inline partitionedTimers *getPartitionedTimers() {
728  return &_partitionedTimers;
729  }
730  inline timeStat *getTimers() { return _timers; }
731  inline counter *getCounters() { return _counters; }
732  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
733  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
734  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
735  inline void resetEventVector() { _event_vector.reset(); }
736  inline void incrementNestValue() { _nestLevel++; }
737  inline int getNestValue() { return _nestLevel; }
738  inline void decrementNestValue() { _nestLevel--; }
739  inline int getGtid() const { return gtid; }
740  inline void setGtid(int newgtid) { gtid = newgtid; }
741  inline void setState(stats_state_e newstate) { state = newstate; }
742  inline stats_state_e getState() const { return state; }
743  inline stats_state_e *getStatePointer() { return &state; }
744  inline bool isIdle() { return thread_is_idle_flag == 1; }
745  inline void setIdleFlag() { thread_is_idle_flag = 1; }
746  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
747  kmp_stats_list *push_back(int gtid); // returns newly created list node
748  inline void push_event(uint64_t start_time, uint64_t stop_time,
749  int nest_level, timer_e name) {
750  _event_vector.push_back(start_time, stop_time, nest_level, name);
751  }
752  void deallocate();
753  class iterator;
754  kmp_stats_list::iterator begin();
755  kmp_stats_list::iterator end();
756  int size();
757  class iterator {
758  kmp_stats_list *ptr;
759  friend kmp_stats_list::iterator kmp_stats_list::begin();
760  friend kmp_stats_list::iterator kmp_stats_list::end();
761 
762  public:
763  iterator();
764  ~iterator();
765  iterator operator++();
766  iterator operator++(int dummy);
767  iterator operator--();
768  iterator operator--(int dummy);
769  bool operator!=(const iterator &rhs);
770  bool operator==(const iterator &rhs);
771  kmp_stats_list *operator*() const; // dereference operator
772  };
773 };
774 
775 /* ****************************************************************
776  Class to encapsulate all output functions and the environment variables
777 
778  This module holds filenames for various outputs (normal stats, events, plot
779  file), as well as coloring information for the plot file.
780 
781  The filenames and flags variables are read from environment variables.
782  These are read once by the constructor of the global variable
783  __kmp_stats_output which calls init().
784 
785  During this init() call, event flags for the timeStat::timerInfo[] global
786  array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
787 
788  The only interface function that is public is outputStats(heading). This
789  function should print out everything it needs to, either to files or stderr,
790  depending on the environment variables described below
791 
792  ENVIRONMENT VARIABLES:
793  KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
794  file, otherwise, print to stderr
795  KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
796  either KMP_STATS_FILE or stderr
797  KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
798  otherwise, the plot file is sent to "events.plt"
799  KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
800  events
801  KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
802  otherwise, output is sent to "events.dat"
803 **************************************************************** */
804 class kmp_stats_output_module {
805 
806 public:
807  struct rgb_color {
808  float r;
809  float g;
810  float b;
811  };
812 
813 private:
814  std::string outputFileName;
815  static const char *eventsFileName;
816  static const char *plotFileName;
817  static int printPerThreadFlag;
818  static int printPerThreadEventsFlag;
819  static const rgb_color globalColorArray[];
820  static rgb_color timerColorInfo[];
821 
822  void init();
823  static void setupEventColors();
824  static void printPloticusFile();
825  static void printHeaderInfo(FILE *statsOut);
826  static void printTimerStats(FILE *statsOut, statistic const *theStats,
827  statistic const *totalStats);
828  static void printCounterStats(FILE *statsOut, statistic const *theStats);
829  static void printCounters(FILE *statsOut, counter const *theCounters);
830  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
831  int gtid);
832  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
833  static void windupExplicitTimers();
834  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
835 
836 public:
837  kmp_stats_output_module() { init(); }
838  void outputStats(const char *heading);
839 };
840 
// Entry points and global objects of the statistics subsystem, declared with
// C linkage. They are only declared here; the definitions live elsewhere.
841 #ifdef __cplusplus
842 extern "C" {
843 #endif
844 void __kmp_stats_init();
845 void __kmp_stats_fini();
846 void __kmp_reset_stats();
// Backs the KMP_OUTPUT_STATS(heading) macro.
847 void __kmp_output_stats(const char *);
848 void __kmp_accumulate_stats_at_exit(void);
849 // thread local pointer to stats node within list
850 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
851 // head to stats list.
852 extern kmp_stats_list *__kmp_stats_list;
853 // lock for __kmp_stats_list
854 extern kmp_tas_lock_t __kmp_stats_lock;
855 // reference start time
856 extern tsc_tick_count __kmp_stats_start_time;
857 // interface to output
858 extern kmp_stats_output_module __kmp_stats_output;
859 
860 #ifdef __cplusplus
861 }
862 #endif
863 
864 // Simple, standard interfaces that drop out completely if stats aren't enabled
865 
// Instrumentation macros for builds with statistics enabled. Each one acts on
// the calling thread's node through __kmp_stats_thread_ptr; `name` is a table
// entry name without its TIMER_ / COUNTER_ prefix.

// Add `value` as one sample to the calling thread's TIMER_<name> statistic.
877 #define KMP_COUNT_VALUE(name, value) \
878  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
879 
// Increment the calling thread's COUNTER_<name> counter.
890 #define KMP_COUNT_BLOCK(name) \
891  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
892 
// Forward to __kmp_output_stats() with the given heading.
910 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
911 
// Call init() on the thread's partitioned timers with an explicitTimer bound
// to TIMER_<name>.
919 #define KMP_INIT_PARTITIONED_TIMERS(name) \
920  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
921  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
922 
// Declare a local blockPartitionedTimer that pushes TIMER_<name> now and pops
// it when the enclosing scope ends. The local has a fixed name, so this can be
// used at most once per scope.
923 #define KMP_TIME_PARTITIONED_BLOCK(name) \
924  blockPartitionedTimer __PBLOCKTIME__( \
925  __kmp_stats_thread_ptr->getPartitionedTimers(), \
926  explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
927  TIMER_##name))
928 
// Push an explicitTimer for TIMER_<name> onto the thread's partitioned timers.
929 #define KMP_PUSH_PARTITIONED_TIMER(name) \
930  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
931  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
932 
// Pop the thread's partitioned timers.
933 #define KMP_POP_PARTITIONED_TIMER() \
934  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
935 
// Call exchange() on the thread's partitioned timers with an explicitTimer for
// TIMER_<name>.
936 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
937  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
938  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
939 
// Set the calling thread's stats state.
940 #define KMP_SET_THREAD_STATE(state_name) \
941  __kmp_stats_thread_ptr->setState(state_name)
942 
// Read the calling thread's stats state.
943 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
944 
// Declare a local blockThreadState that installs state_name now and restores
// the previous state when the enclosing scope ends. The local has a fixed
// name, so this can be used at most once per scope.
945 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \
946  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
947  state_name)
948 
// Forward to __kmp_reset_stats().
956 #define KMP_RESET_STATS() __kmp_reset_stats()
957 
// Developer variants of the instrumentation macros: plain aliases of the
// regular macros when KMP_DEVELOPER_STATS is non-zero, no-ops otherwise.
// NOTE(review): KMP_TIME_BLOCK, KMP_START_EXPLICIT_TIMER and
// KMP_STOP_EXPLICIT_TIMER have no visible definition in the stats-enabled
// branch of this header (only the disabled branch defines them) -- confirm
// that the aliases below still resolve when KMP_DEVELOPER_STATS is turned on.
958 #if (KMP_DEVELOPER_STATS)
959 #define KMP_TIME_DEVELOPER_BLOCK(n) KMP_TIME_BLOCK(n)
960 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
961 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
962 #define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) KMP_START_EXPLICIT_TIMER(n)
963 #define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) KMP_STOP_EXPLICIT_TIMER(n)
964 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
965 #else
966 // Null definitions
967 #define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
968 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
969 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
970 #define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
971 #define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
972 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
973 #endif
974 
975 #else // KMP_STATS_ENABLED
976 
// Null definitions: with statistics disabled every instrumentation macro
// expands to the no-op expression ((void)0), so call sites compile unchanged
// and their arguments are never evaluated. Every macro defined in the enabled
// branch needs a counterpart here; KMP_EXCHANGE_PARTITIONED_TIMER was missing,
// which broke any caller of it in a stats-disabled build.
#define KMP_TIME_BLOCK(n) ((void)0)
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)
#define KMP_START_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_EXPLICIT_TIMER(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1000 #endif // KMP_STATS_ENABLED
1001 
1002 #endif // KMP_STATS_H
statistic is valid only for master
Definition: kmp_stats.h:52
statistic is valid only for non-master threads
Definition: kmp_stats.h:54
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:51
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:282
statistic doesn't need units printed next to it
Definition: kmp_stats.h:53
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:50
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:95
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:64