#ifndef FEATURIZATION_H
#define FEATURIZATION_H

#include <cstdint>
#include <cstring>
#include <iostream>

#include "ASLog.h"

namespace Halide {
namespace Internal {

// The algorithm-specific features. For legacy reasons these are
// called PipelineFeatures in the code.
struct PipelineFeatures {
    static constexpr size_t num_features() {
        return sizeof(PipelineFeatures) / sizeof(int);
    }

    static constexpr uint32_t version() {
        return 3;
    }

    // Access them by index.
    int &operator[](int idx) {
        return ((int *)(this))[idx];
    }

    int operator[](int idx) const {
        return ((const int *)(this))[idx];
    }
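
    // Illustrative sketch (not part of the original header): because the
    // struct is a plain aggregate of ints, num_features() and the indexing
    // operators let callers walk every feature without naming each field:
    //
    //     PipelineFeatures f = {};
    //     for (size_t i = 0; i < PipelineFeatures::num_features(); i++) {
    //         if (f[(int)i] != 0) { /* feed feature i to the cost model */ }
    //     }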

    enum class OpType {
        Const,
        Cast,
        Variable,
        Param,
        Add,
        Sub,
        Mod,
        Mul,
        Div,
        Min,
        Max,
        EQ,
        NE,
        LT,
        LE,
        And,
        Or,
        Not,
        Select,
        ImageCall,   // Loads from an input buffer
        FuncCall,    // Calls to another pipeline stage
        SelfCall,    // Recursive calls from a Func to itself
        ExternCall,  // Math intrinsics, typically
        Let,
        NumOpTypes,
    };

    enum class ScalarType {
        Bool,
        UInt8,   // or Int8
        UInt16,  // or Int16
        UInt32,  // or Int32
        UInt64,  // or Int64
        Float,
        Double,
        NumScalarTypes
    };

    // Not fed into the network, but helps avoid printing huge numbers
    // of zeros while debugging things.
    int types_in_use[(int)ScalarType::NumScalarTypes] = {};

    int op_histogram[(int)OpType::NumOpTypes][(int)ScalarType::NumScalarTypes] = {};

    enum class AccessType {
        LoadFunc,
        LoadSelf,
        LoadImage,
        Store,
        NumAccessTypes
    };

    // Finer granularity call/store node properties. These are a
    // function of the matrix of derivatives of each arg to a
    // call w.r.t the loop variables of the Stage. Each row of
    // the matrix corresponds to one of the call arguments. In
    // each case we illustrate such a call, assuming that the
    // variables of this Func are x, y, z, and that the
    // dimension vectorized over is the first (x).

    // Square identity matrix. f(x - 2, y + 8, z + param)
    int pointwise_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Square permutation matrix. f(y + 1, z - 3, x)
    int transpose_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Each row sums to 1. Each column sums to 1 or 0. f(y, x)
    int broadcast_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Each row sums to 1 or 0. Each column sums to 1. f(z, y, x, 4)
    int slice_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};

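    // Illustrative sketch (not from the original header): for a stage with
    // loop variables (x, y, z), the call f(y + 1, z - 3, x) has the
    // derivative matrix below. It is a square permutation matrix, so the
    // access is counted in transpose_accesses:
    //
    //             dx dy dz
    //     arg 0 [  0  1  0 ]   // d(y + 1)/dy == 1
    //     arg 1 [  0  0  1 ]   // d(z - 3)/dz == 1
    //     arg 2 [  1  0  0 ]   // d(x)/dx == 1
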
    template<typename OS>
    void dump(OS &os) const {
        for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) {
            const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"};
            // Skip printing for types not used
            if (!types_in_use[i]) {
                continue;
            }

            os << "    Featurization for type " << type_names[i] << "\n"
               << "     Op histogram:\n"
               << "      Constant:   " << op_histogram[(int)OpType::Const][i] << "\n"
               << "      Cast:       " << op_histogram[(int)OpType::Cast][i] << "\n"
               << "      Variable:   " << op_histogram[(int)OpType::Variable][i] << "\n"
               << "      Param:      " << op_histogram[(int)OpType::Param][i] << "\n"
               << "      Add:        " << op_histogram[(int)OpType::Add][i] << "\n"
               << "      Sub:        " << op_histogram[(int)OpType::Sub][i] << "\n"
               << "      Mod:        " << op_histogram[(int)OpType::Mod][i] << "\n"
               << "      Mul:        " << op_histogram[(int)OpType::Mul][i] << "\n"
               << "      Div:        " << op_histogram[(int)OpType::Div][i] << "\n"
               << "      Min:        " << op_histogram[(int)OpType::Min][i] << "\n"
               << "      Max:        " << op_histogram[(int)OpType::Max][i] << "\n"
               << "      EQ:         " << op_histogram[(int)OpType::EQ][i] << "\n"
               << "      NE:         " << op_histogram[(int)OpType::NE][i] << "\n"
               << "      LT:         " << op_histogram[(int)OpType::LT][i] << "\n"
               << "      LE:         " << op_histogram[(int)OpType::LE][i] << "\n"
               << "      And:        " << op_histogram[(int)OpType::And][i] << "\n"
               << "      Or:         " << op_histogram[(int)OpType::Or][i] << "\n"
               << "      Not:        " << op_histogram[(int)OpType::Not][i] << "\n"
               << "      Select:     " << op_histogram[(int)OpType::Select][i] << "\n"
               << "      ImageCall:  " << op_histogram[(int)OpType::ImageCall][i] << "\n"
               << "      FuncCall:   " << op_histogram[(int)OpType::FuncCall][i] << "\n"
               << "      SelfCall:   " << op_histogram[(int)OpType::SelfCall][i] << "\n"
               << "      ExternCall: " << op_histogram[(int)OpType::ExternCall][i] << "\n"
               << "      Let:        " << op_histogram[(int)OpType::Let][i] << "\n"
               << "     Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n"
               << "      Pointwise: "
               << pointwise_accesses[0][i] << " "
               << pointwise_accesses[1][i] << " "
               << pointwise_accesses[2][i] << " "
               << pointwise_accesses[3][i] << "\n"
               << "      Transpose: "
               << transpose_accesses[0][i] << " "
               << transpose_accesses[1][i] << " "
               << transpose_accesses[2][i] << " "
               << transpose_accesses[3][i] << "\n"
               << "      Broadcast: "
               << broadcast_accesses[0][i] << " "
               << broadcast_accesses[1][i] << " "
               << broadcast_accesses[2][i] << " "
               << broadcast_accesses[3][i] << "\n"
               << "      Slice:     "
               << slice_accesses[0][i] << " "
               << slice_accesses[1][i] << " "
               << slice_accesses[2][i] << " "
               << slice_accesses[3][i] << "\n";
        }
    }
    void dump() const {
        auto os = aslog(0);
        dump(os);
    }
};
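
// Illustrative usage sketch (not from the original header): dump(OS &) is a
// template, so any stream-like object supporting operator<< works, not just
// the aslog object used by the no-argument overload:
//
//     PipelineFeatures pf = {};
//     pf.types_in_use[(int)PipelineFeatures::ScalarType::Float] = 1;
//     pf.dump(std::cerr);  // prints only the Float rows; unused types are skipped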

// The schedule-dependent portion of the featurization of a stage
struct ScheduleFeatures {
    static constexpr size_t num_features() {
        return sizeof(ScheduleFeatures) / sizeof(double);
    }

    static constexpr uint32_t version() {
        return 3;
    }

    double &operator[](int idx) {
        return ((double *)(this))[idx];
    }

    double operator[](int idx) const {
        return ((const double *)(this))[idx];
    }

    // The number of times storage for this stage is allocated. The
    // product of outer loops at the store_at site.
    double num_realizations = 0;

    // The number of times a tile of the stage is computed. The
    // product of outer loops at the compute_at site. Always at least as
    // large as num_realizations.
    double num_productions = 0;

    // Number of times the innermost loop happens per allocation.
    double points_computed_per_realization = 0;

    // Number of times the innermost stmt happens per tile computed.
    double points_computed_per_production = 0;

    // The total trip count of the innermost loop over the entire program.
    //  == num_realizations * points_computed_per_realization
    //  ~= num_productions * points_computed_per_production
    // Only approximately equal because of the simplifications made
    // regarding the modeling of sliding window.
    double points_computed_total = 0;

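    // Illustrative numeric sketch (not from the original header): if storage
    // is allocated 10 times (num_realizations == 10) and each realization
    // runs the innermost loop 1000 times, then
    // points_computed_total == 10 * 1000 == 10000. With 20 productions over
    // the program (num_productions == 20), points_computed_per_production
    // is roughly 10000 / 20 == 500, up to sliding-window effects.
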
    // The minimum number of points that are actually required to be
    // computed to produce a correct output. Not actually a function
    // of the schedule, but a useful reference point to see if a
    // schedule has gone off the rails.
    double points_computed_minimum = 0;

    // Trip count of the innermost loop nest.
    double innermost_loop_extent = 0;

    // Trip count of just the pure loops in the innermost loop
    // (i.e. excludes loops representing reductions).
    double innermost_pure_loop_extent = 0;

    // If this is to be unrolled, the product of the unrolling factors.
    double unrolled_loop_extent = 0;

    // The number of parallel jobs launched in the production of this
    // stage. Always 1 unless the Func is compute_root, because we
    // place all parallelism at the outermost level.
    double inner_parallelism = 0;

    // The number of times this Func could be realized in parallel. 1
    // when the Func is compute_root. Product of the containing
    // parallel loops for other stages.
    double outer_parallelism = 0;

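    // Illustrative numeric sketch (not from the original header): if a
    // consumer's outermost loop is parallelized over 16 tiles and this Func
    // is computed inside it, outer_parallelism == 16 and
    // inner_parallelism == 1. If instead the Func is compute_root and its
    // own outer loop is split into 8 parallel tasks, inner_parallelism == 8
    // and outer_parallelism == 1.
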
    // Size of the region computed at the store_at site, measured in
    // bytes. Does not take storage-folding optimizations into account.
    double bytes_at_realization = 0;

    // Size of the region computed per tile (at the compute_at site),
    // measured in bytes. This includes the effect of storage folding,
    // so it's a better number to look at to estimate memory usage.
    double bytes_at_production = 0;

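    // Illustrative sketch (not from the original header): for a producer
    // holding 32-bit floats over a 1024 x 1024 region, bytes_at_realization
    // is 1024 * 1024 * 4. If a 3-row sliding window lets storage fold down
    // to 3 rows, bytes_at_production shrinks to roughly 3 * 1024 * 4 per
    // tile.
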
    // If the stage were hypothetically scheduled at root, how much
    // memory would it consume? Doesn't vary w.r.t. the schedule, but
    // a useful reference.
    double bytes_at_root = 0;

    // Same as the above, but only measuring the extent along the
    // innermost dimension, so that we can reason about spatial
    // locality, cache lines, prefetchers, etc.
    double innermost_bytes_at_realization = 0;
    double innermost_bytes_at_production = 0;
    double innermost_bytes_at_root = 0;

    // For inlined Funcs, how many calls are made to this Func total.
    double inlined_calls = 0;

    // Number of unique bytes and unique contiguous segments of
    // memory loaded from all inputs over a single trip of the loop
    // containing the allocation site.
    double unique_bytes_read_per_realization = 0;
    double unique_lines_read_per_realization = 0;

    // The sum of the sizes of the allocations accessed at this
    // site. Gives a hint as to the likely locality of it.
    double allocation_bytes_read_per_realization = 0;

    // The sum of the sizes of the temporary allocations while
    // computing one tile of this Func. Probably a good thing if it
    // fits in cache.
    double working_set = 0;

    // The vectorization factor (#simd lanes) to be used to compute
    // this stage. Wasted work if it's smaller than the stage's native
    // vector size.
    double vector_size = 0;

    // The native vector size for the narrowest type used. Does not
    // vary with the schedule, but a useful reference point.
    double native_vector_size = 0;

    // Number of SIMD vectors computed.
    double num_vectors = 0;

    // Number of scalars computed (e.g. from tails of loops).
    double num_scalars = 0;

    // The number of loads done per vector or scalar computed. Vector
    // gathers count as a batch of scalar loads. These get amortized
    // across unrolled blocks if some loads can be reused across the
    // unrolled dimension.
    double scalar_loads_per_vector = 0;
    double vector_loads_per_vector = 0;
    double scalar_loads_per_scalar = 0;

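    // Illustrative numeric sketch (not from the original header): for
    // g(x, y) = f(x, y - 1) + f(x, y) + f(x, y + 1), each vector of g needs
    // three vector loads of f. Unrolling y by 4 covers rows y - 1 .. y + 4,
    // so the 12 taps hit only 6 distinct loads, and the amortized
    // vector_loads_per_vector drops from 3 to 1.5.
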
    // The memory footprint written per parallel task. The union of the
    // regions if the stage is computed at finer granularity than one
    // parallel task of some consumer.
    double bytes_at_task = 0;
    double innermost_bytes_at_task = 0;

    // The memory footprint accessed while computing a single vector.
    double unique_bytes_read_per_vector = 0;
    double unique_lines_read_per_vector = 0;

    // The memory footprint accessed per parallel task. Only counts
    // loads from things computed outside of that parallel task (to
    // measure the amount of traffic coming from another core).
    double unique_bytes_read_per_task = 0;
    double unique_lines_read_per_task = 0;

    // The sum of the sizes of all live allocations at various sites.
    double working_set_at_task = 0;
    double working_set_at_production = 0;
    double working_set_at_realization = 0;
    double working_set_at_root = 0;

    template<typename OS>
    void dump(OS &os) const {
        os << "    num_realizations:                      " << num_realizations << "\n"
           << "    num_productions:                       " << num_productions << "\n"
           << "    points_computed_per_realization:       " << points_computed_per_realization << "\n"
           << "    points_computed_per_production:        " << points_computed_per_production << "\n"
           << "    points_computed_total:                 " << points_computed_total << "\n"
           << "    points_computed_minimum:               " << points_computed_minimum << "\n"
           << "    innermost_loop_extent:                 " << innermost_loop_extent << "\n"
           << "    innermost_pure_loop_extent:            " << innermost_pure_loop_extent << "\n"
           << "    unrolled_loop_extent:                  " << unrolled_loop_extent << "\n"
           << "    inner_parallelism:                     " << inner_parallelism << "\n"
           << "    outer_parallelism:                     " << outer_parallelism << "\n"
           << "    bytes_at_realization:                  " << bytes_at_realization << "\n"
           << "    bytes_at_production:                   " << bytes_at_production << "\n"
           << "    bytes_at_root:                         " << bytes_at_root << "\n"
           << "    innermost_bytes_at_realization:        " << innermost_bytes_at_realization << "\n"
           << "    innermost_bytes_at_production:         " << innermost_bytes_at_production << "\n"
           << "    innermost_bytes_at_root:               " << innermost_bytes_at_root << "\n"
           << "    inlined_calls:                         " << inlined_calls << "\n"
           << "    unique_bytes_read_per_realization:     " << unique_bytes_read_per_realization << "\n"
           << "    unique_lines_read_per_realization:     " << unique_lines_read_per_realization << "\n"
           << "    allocation_bytes_read_per_realization: " << allocation_bytes_read_per_realization << "\n"
           << "    working_set:                           " << working_set << "\n"
           << "    vector_size:                           " << vector_size << "\n"
           << "    native_vector_size:                    " << native_vector_size << "\n"
           << "    num_vectors:                           " << num_vectors << "\n"
           << "    num_scalars:                           " << num_scalars << "\n"
           << "    scalar_loads_per_vector:               " << scalar_loads_per_vector << "\n"
           << "    vector_loads_per_vector:               " << vector_loads_per_vector << "\n"
           << "    scalar_loads_per_scalar:               " << scalar_loads_per_scalar << "\n"
           << "    bytes_at_task:                         " << bytes_at_task << "\n"
           << "    innermost_bytes_at_task:               " << innermost_bytes_at_task << "\n"
           << "    unique_bytes_read_per_vector:          " << unique_bytes_read_per_vector << "\n"
           << "    unique_lines_read_per_vector:          " << unique_lines_read_per_vector << "\n"
           << "    unique_bytes_read_per_task:            " << unique_bytes_read_per_task << "\n"
           << "    unique_lines_read_per_task:            " << unique_lines_read_per_task << "\n"
           << "    working_set_at_task:                   " << working_set_at_task << "\n"
           << "    working_set_at_production:             " << working_set_at_production << "\n"
           << "    working_set_at_realization:            " << working_set_at_realization << "\n"
           << "    working_set_at_root:                   " << working_set_at_root << "\n";
    }
    void dump() const {
        auto os = aslog(0);
        dump(os);
    }

    bool equal(const ScheduleFeatures &other) const {
        const size_t n_features = ScheduleFeatures::num_features();
        for (size_t i = 0; i < n_features; i++) {
            if ((*this)[i] != other[i]) {
                return false;
            }
        }
        return true;
    }
};
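
// Illustrative sketch (not from the original header): the flat double layout
// makes it easy to hand a featurization to a cost model as one array, which
// is the same element-wise view that equal() compares:
//
//     ScheduleFeatures sf;
//     double buf[ScheduleFeatures::num_features()];
//     for (size_t i = 0; i < ScheduleFeatures::num_features(); i++) {
//         buf[i] = sf[(int)i];
//     }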

/*
Some feature values cannot be cached, and need to be recomputed.
These intermediates allow for faster recomputation of such features.
*/
struct FeatureIntermediates {
    double inlined_calls;
    double num_scalars;
    double innermost_pure_loop_extent;
    double outer_parallelism;
};

}  // namespace Internal
}  // namespace Halide

#endif